"""Convert Traefik JSON access-log files to Parquet.

Each input line is one JSON log record.  For every line, one row is appended
to the *logs* Parquet file; for requests that reference OGC layers, one row
per layer is additionally appended to the *layers* Parquet file.
"""

import copy  # NOTE(review): no longer used (strings are immutable); kept to preserve the import surface
import datetime
import hashlib
import json
import os
import re

import duckdb  # NOTE(review): unused here; presumably for downstream querying of the Parquet output — confirm
import fastparquet as fp
import pandas as pd

# Input Traefik log files (one JSON document per line).
tk_files = [
    './data/2023-07',
    './data/2023-08',
    './data/2023-09',
    './data/2023-10',
    './data/2023-11',
    './data/2023-12',
]
# Output Parquet files (appended to across input files).
pq_file_logs = './parquet/logs.parquet'
pq_file_layers = './parquet/layers.parquet'

limit = False    # False = process every line; an int = stop after that many lines per file
group = 1000000  # flush buffers to Parquet every `group` lines to bound memory use

# Maps a request-path prefix to an application label.  First matching prefix
# in iteration order wins, so more specific prefixes must come before '/'.
APPLICATION_FROM_PATH = {
    '/cas': 'cas',
    '/console': 'console',
    '/login': 'login',
    '/cadastrapp': 'cadastrapp',
    '/geoserver': 'geoserver',
    '/metadata': 'metadata',
    '/files': 'files',
    '/geonetwork': 'geonetwork',
    '/tools/mdedit': 'mdedit',
    '/tools/mviewer': 'mviewer',
    '/mviewer': 'mviewer',
    '/mapstore': 'mapstore',
    '/geocontrib': 'geocontrib',
    '/data4citizen': 'data4citizen',
    '/portail': 'cms',
    '/': 'root',
    '/robots': 'robots',
}

# Pre-compiled OGC query-parameter patterns, hoisted out of the per-line loop.
# FIX: the original classes `[?|&]` accidentally matched a literal '|', and the
# trailing `[?|&|$]` on the layers pattern matched a literal '$' rather than
# end-of-string, silently dropping a `layers=` parameter at the end of a URL.
_RE_SERVICE = re.compile(r"[?&]service=([a-z]*)", re.IGNORECASE)
_RE_REQUEST = re.compile(r"[?&]request=([a-z]*)", re.IGNORECASE)
_RE_VERSION = re.compile(r"[?&]version=([0-9.]*)", re.IGNORECASE)
_RE_CRS = re.compile(r"[?&]crs=([a-z0-9.:]*)", re.IGNORECASE)
_RE_BBOX = re.compile(r"[?&]bbox=([a-z0-9.:;,]*)", re.IGNORECASE)
_RE_WORKSPACE = re.compile(r"/geoserver/([a-z0-9_.]*)/[a-z]", re.IGNORECASE)
_RE_LAYERS = re.compile(r"[?&]layers=([a-z0-9_.:,-]*)(?:[?&]|$)", re.IGNORECASE)


def write_parquet(data, pq_file):
    """Append `data` (a list of dicts) to `pq_file`, creating it if absent.

    Empty batches are skipped: appending a schema-less empty DataFrame is
    useless and fragile with fastparquet.
    """
    if not data:
        return
    df = pd.DataFrame(data)
    if not os.path.isfile(pq_file):
        fp.write(pq_file, df, compression='GZIP')
    else:
        fp.write(pq_file, df, compression='GZIP', append=True)


def _extract_first(pattern, path, lower=True):
    """Return the first capture of `pattern` in `path`, or '' when absent.

    The result is lowercased unless `lower` is False (bbox and layer names
    keep their original case, matching the original behaviour).
    """
    match = pattern.search(path)
    if not match:
        return ''
    value = match.group(1)
    return value.lower() if lower else value


def _parse_line(line):
    """Parse one raw JSON log line into (log_record, layer_records).

    `log_record` is a flat dict for the logs table; `layer_records` is a
    (possibly empty) list of dicts, one per OGC layer named in the request.
    """
    # Hash of the raw line links layer rows back to their log row.
    line_hash = hashlib.sha256(line.encode('utf-8')).hexdigest()
    record = json.loads(line)
    time = datetime.datetime.fromisoformat(record['StartUTC'])
    path = record['RequestPath']

    # First matching path prefix wins; 'other' when nothing matches.
    dge_application = 'other'
    for prefix, application in APPLICATION_FROM_PATH.items():
        if path.startswith(prefix):
            dge_application = application
            break

    ogc_service = _extract_first(_RE_SERVICE, path)
    ogc_request = _extract_first(_RE_REQUEST, path)
    ogc_version = _extract_first(_RE_VERSION, path)
    ogc_crs = _extract_first(_RE_CRS, path)
    ogc_bbox = _extract_first(_RE_BBOX, path, lower=False)
    ogc_workspace = _extract_first(_RE_WORKSPACE, path)
    ogc_layers = _extract_first(_RE_LAYERS, path, lower=False)

    timestamp = int(time.timestamp())

    layer_records = []
    if ogc_layers:
        for ogc_layer in ogc_layers.split(','):
            parts = ogc_layer.split(':')
            if len(parts) == 2:
                # Qualified name "workspace:layer".
                workspace, layer = parts
            else:
                # Unqualified: fall back to the workspace from the URL path.
                workspace, layer = ogc_workspace, parts[0]
            layer_records.append({
                'log_hash': line_hash,
                'timestamp': timestamp,
                'version': ogc_version,
                'application': dge_application,
                'service': ogc_service,
                'request': ogc_request,
                'workspace': workspace,
                'layer': layer,
                'crs': ogc_crs,
                'bbox': ogc_bbox,
            })

    log_record = {
        'hash': line_hash,
        'timestamp': timestamp,
        'year': time.year,
        'month': time.month,
        'day': time.day,
        'hour': time.hour,
        'minute': time.minute,
        'second': time.second,
        'microsecond': time.microsecond,
        'org_content_size': record['OriginContentSize'],
        'req_address': record['RequestAddr'],
        'req_content_size': record['RequestContentSize'],
        'req_count': record['RequestCount'],
        'req_host': record['RequestHost'],
        'req_method': record['RequestMethod'],
        'req_path': path,
        'req_port': record['RequestPort'],
        'req_protocol': record['RequestProtocol'],
        'req_scheme': record['RequestScheme'],
        'dge_application': dge_application,
        'ogc_service': ogc_service,
        'ogc_version': ogc_version,
        'ogc_request': ogc_request,
        'ogc_workspace': ogc_workspace,
        'ogc_layers': ogc_layers,
        'ogc_crs': ogc_crs,
        'ogc_bbox': ogc_bbox,
    }
    return log_record, layer_records


def convert_file(tk_file, pq_file_logs, pq_file_layers, limit=False):
    """Convert one Traefik log file to the logs and layers Parquet files.

    Parameters:
        tk_file: path of the input file (one JSON record per line).
        pq_file_logs: output Parquet file for per-request rows.
        pq_file_layers: output Parquet file for per-layer rows.
        limit: False to read the whole file, or an int maximum line count.

    Buffers are flushed every `group` lines and once at the end.
    FIX: the original exited the loop when `limit` was reached without
    flushing the partial buffers, silently dropping up to `group` records.
    """
    logs = []
    layers = []
    num_line = 0
    print('TK file:', tk_file)
    print('0')
    with open(tk_file, 'r', encoding="utf-8") as tkf:
        for line in tkf:
            if limit and num_line >= limit:
                break
            log_record, layer_records = _parse_line(line)
            logs.append(log_record)
            layers.extend(layer_records)
            num_line += 1
            if num_line % group == 0:
                print(num_line)
                write_parquet(logs, pq_file_logs)
                logs = []
                write_parquet(layers, pq_file_layers)
                layers = []
    # Final flush covers both EOF and the limit-reached case.
    write_parquet(logs, pq_file_logs)
    write_parquet(layers, pq_file_layers)


if __name__ == "__main__":
    print('=' * 80)
    start = datetime.datetime.now()
    print("Start:", start.strftime("%H:%M:%S"))
    for tk_file in tk_files:
        start_table = datetime.datetime.now()
        print('-' * 80)
        if os.path.isfile(tk_file):
            convert_file(tk_file, pq_file_logs, pq_file_layers, limit=limit)
        else:
            print(tk_file, 'not exists.')
        end_table = datetime.datetime.now()
        # Drop microseconds from the printed duration.
        print("Duration:", str(end_table - start_table).split('.')[0])
    end = datetime.datetime.now()
    print()
    print("End:", end.strftime("%H:%M:%S"))
    print("Total duration:", str(end - start).split('.')[0])
    print('=' * 80)