first commit 20240311

Guillaume RYCKELYNCK 2024-03-11 08:36:55 +01:00
commit 9b41e2350f
11 changed files with 947 additions and 0 deletions

.gitignore vendored Normal file (+5 lines)

@@ -0,0 +1,5 @@
python
data/
db/
parquet/
results/

README.md Normal file (+86 lines)

@@ -0,0 +1,86 @@
# DGE Traefik logs

The goal of this project is (1) to convert the Traefik log files from JSON into a format better suited to their exploitation and (2) to allow querying the data in order to extract statistics and indicators on the usage of the DataGrandEst platform.

Two approaches were studied:

- Using a SQLite database
- Using Parquet files

Note that the source files represent about 113 GB of data in total.

## SQLITE database

This is a fairly conventional approach and quite simple to implement.

Chosen approach:

- Create a sqlite3 database
- Read each monthly file and load it into the database (adapting it as needed)
- Process the data in SQL

It relies on 2 scripts:

1. `sqlite_import.py` = import the logs into the SQLite database
2. `sqlite_stats.py` = compute the main monthly stats

### Import

cf. `sqlite_import.py`

Each log file is imported line by line into a dedicated monthly table.
E.g. the file "2023-07" is imported into the table "LOGS_2023_07", and so on.
In parallel, the WMS requests are imported into a dedicated table, "LAYERS_2023_07" (following the previous example). The goal is eventually to have statistics on the most consulted layers.
The same work remains to be done for WFS and WMTS layers.
Importing all the files takes about 1 hour. The final database file is about 56 GB.
``` txt
2023-07 - 8.26 GB - 0:05:24
2023-08 - 16.29 GB - 0:09:42
2023-09 - 20.12 GB - 0:11:45
2023-10 - 23.26 GB - 0:12:44
2023-11 - 25.97 GB - 0:13:45
2023-12 - 19.7 GB - 0:10:17
```
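
For reference, here is a simplified sketch of the import loop used in `sqlite_import.py` (table creation and the OGC/layer parsing are omitted, and only a few of the columns are shown):

``` python
import datetime
import hashlib
import json
import sqlite3

def import_month(db_file, tk_file, table):
    """Insert each JSON log line of tk_file into its monthly table (e.g. LOGS_2023_07)."""
    with sqlite3.connect(db_file) as conn, open(tk_file, encoding="utf-8") as tkf:
        for line in tkf:
            row = json.loads(line)
            time = datetime.datetime.fromisoformat(row['StartUTC'])
            conn.execute(
                f'INSERT INTO "{table}" ("hash", "timestamp", "req_path") VALUES (?, ?, ?)',
                (hashlib.sha256(line.encode('utf-8')).hexdigest(),
                 int(time.timestamp()),
                 row['RequestPath']))

import_month('./db/dge_traefik_logs.db3', './data/2023-07', 'LOGS_2023_07')
```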

### Stats

cf. `sqlite_stats.py`

This script computes stats by running SELECT queries against the database.
The result is output as CSV; a copy/paste is enough to bring it into an Excel spreadsheet.
Computing the stats takes about 5 to 7 minutes.
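
The queries, run once per monthly table, are of this kind (the SQL is taken from `sqlite_stats.py`; the database path is the one used by `sqlite_import.py`):

``` python
import sqlite3

# Total number of log lines, and requests per OGC service, for one monthly table.
sql_nb_lines = "SELECT COUNT(id) AS nb_lines FROM {table};"
sql_nb_by_service = (
    "SELECT ogc_service AS service, COUNT(id) AS requests FROM {table} "
    "WHERE dge_application IN ('geoserver', 'geonetwork') "
    "AND ogc_service IN ('wms', 'wfs', 'wmts', 'csw') "
    "GROUP BY ogc_service ORDER BY ogc_service;")

with sqlite3.connect('./db/dge_traefik_logs.db3') as conn:
    table = 'LOGS_2023_07'
    print(conn.execute(sql_nb_lines.format(table=table)).fetchone()[0])
    print(conn.execute(sql_nb_by_service.format(table=table)).fetchall())
```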

## PARQUET format

Creating a Parquet file is not completely straightforward.

### SQLITE to PARQUET

cf. `sqlite_to_parquet.py`

The first attempt was to create Parquet files from the SQLite database (cf. `sqlite_to_parquet.py`).
This remains fairly simple, but it requires the rather long preliminary step of generating the SQLite database (cf. above). The result is one Parquet file per table of the database.
The data is then queried with `parquet_sqlite_stats.py`.
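
The conversion itself boils down to one DuckDB COPY per table, as in `sqlite_to_parquet.py` (the output path is adjusted here to the `./parquet/` directory read by `parquet_sqlite_stats.py`):

``` python
import duckdb

# Attach the SQLite database, then export every table to its own Parquet file.
duckdb.sql("ATTACH './db/dge_traefik_logs.db3' AS logs; USE logs;")
tables = duckdb.sql("SHOW TABLES;").pl().get_column('name').to_list()
for table in tables:
    duckdb.sql(f"COPY (SELECT * FROM logs.{table}) TO './parquet/{table}.parquet' (FORMAT 'parquet')")
```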

### JSON to PARQUET

cf. `parquet_import.py`

Another test consisted in loading the logs directly into Parquet files, following the same model as for the SQLite database (cf. `parquet_import.py`).
The operation is fairly long but has 2 advantages:

- In the end there are only 2 Parquet files, "logs.parquet" and "layers.parquet", containing all the data
- The file size is much smaller: 8 GB in total for the 2 files, whereas the SQLite database is 56 GB and the source JSON data 113 GB.

The conversion takes about 1h30.
The data is then queried with `parquet_stats.py`.
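
Querying the Parquet files is then done directly with DuckDB's `read_parquet()`, e.g. the monthly totals computed in `parquet_stats.py`:

``` python
import duckdb

# Monthly line counts and transferred volume, straight from logs.parquet.
df = duckdb.sql("""
    SELECT year, month, COUNT(*) AS nb_lines,
        SUM(CAST(org_content_size AS INTEGER)) AS size
    FROM read_parquet('./parquet/logs.parquet')
    GROUP BY year, month;
""").df()
print(df)
```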

## TODO

- [ ] Add the requests coming from WFS flows to "layers.parquet".
- [ ] Make it possible to generate "layers.parquet" from "logs.parquet", to make updates easier.

parquet_import.py Normal file (+180 lines)

@@ -0,0 +1,180 @@
import json
import re
import os
import copy
import datetime
import hashlib
import duckdb
import pandas as pd
import fastparquet as fp
tk_files = ['./data/2023-07', './data/2023-08', './data/2023-09', './data/2023-10', './data/2023-11', './data/2023-12']
pq_file_logs = './parquet/logs.parquet'
pq_file_layers = './parquet/layers.parquet'
limit = False
group = 1000000
APPLICATION_FROM_PATH = {
'/cas': 'cas',
'/console': 'console',
'/login': 'login',
'/cadastrapp': 'cadastrapp',
'/geoserver': 'geoserver',
'/metadata': 'metadata',
'/files': 'files',
'/geonetwork': 'geonetwork',
'/tools/mdedit': 'mdedit',
'/tools/mviewer': 'mviewer',
'/mviewer': 'mviewer',
'/mapstore': 'mapstore',
'/geocontrib': 'geocontrib',
'/data4citizen': 'data4citizen',
'/portail': 'cms',
'/': 'root',
'/robots': 'robots',
}
def write_parquet(data, pq_file):
df_logs = pd.DataFrame(data)
if not os.path.isfile(pq_file):
fp.write(pq_file, df_logs, compression='GZIP')
else:
fp.write(pq_file, df_logs, compression='GZIP', append=True)
def convert_file(tk_file, pq_file_logs, pq_file_layers, limit=False):
# convert TK file to JSON file
with open(tk_file, 'r', encoding="utf-8") as tkf:
num_line = 0
logs = []
layers = []
# df_logs = None
# df_layers = None
print('TK file:', tk_file)
print('0')
while (num_line < limit or not limit):
line = tkf.readline()
if not line:
write_parquet(logs, pq_file_logs)
write_parquet(layers, pq_file_layers)
break
else:
hash = hashlib.sha256(line.encode('utf-8')).hexdigest()
line_json = json.loads(line)
time = datetime.datetime.fromisoformat(line_json['StartUTC'])
dge_application = 'other'
for application in APPLICATION_FROM_PATH:
if line_json['RequestPath'].startswith(application):
dge_application = APPLICATION_FROM_PATH[application]
break
ogc_service = re.findall("[?|&]service=([a-z]*)[?|&]?", line_json['RequestPath'], re.IGNORECASE)
ogc_service = ogc_service[0].lower() if ogc_service else ''
ogc_request = re.findall("[?|&]request=([a-z]*)[?|&]?", line_json['RequestPath'], re.IGNORECASE)
ogc_request = ogc_request[0].lower() if len(ogc_request) else ''
ogc_version = re.findall("[?|&]version=([0-9.]*)[?|&]?", line_json['RequestPath'], re.IGNORECASE)
ogc_version = ogc_version[0].lower() if len(ogc_version) else ''
ogc_crs = re.findall("[?|&]crs=([a-z0-9.:]*)[?|&]?", line_json['RequestPath'], re.IGNORECASE)
ogc_crs = ogc_crs[0].lower() if ogc_crs else ''
ogc_bbox = re.findall("[?|&]bbox=([a-z0-9.:;,]*)[?|&]?", line_json['RequestPath'], re.IGNORECASE)
ogc_bbox = ogc_bbox[0] if ogc_bbox else ''
ogc_workspace = re.findall("/geoserver/([a-z0-9_.]*)/[a-z]", line_json['RequestPath'], re.IGNORECASE)
ogc_workspace = ogc_workspace[0].lower() if len(ogc_workspace) else ''
ogc_layers = re.findall("[?|&]layers=([a-z0-9_.:,-]*)[?|&|$]", line_json['RequestPath'], re.IGNORECASE)
ogc_layers = ogc_layers[0] if len(ogc_layers) else ''
if ogc_layers:
for ogc_layer in ogc_layers.split(','):
layer_parts = ogc_layer.split(':')
if len(layer_parts) == 2:
layer = layer_parts[1]
workspace = layer_parts[0]
else:
layer = layer_parts[0]
workspace = ogc_workspace
layers.append({
'log_hash': hash,
'timestamp': int(time.timestamp()),
'version': copy.deepcopy(ogc_version),
'application': copy.deepcopy(dge_application),
'service': copy.deepcopy(ogc_service),
'request': copy.deepcopy(ogc_request),
'workspace': copy.deepcopy(workspace),
'layer': copy.deepcopy(layer),
'crs': copy.deepcopy(ogc_crs),
'bbox': copy.deepcopy(ogc_bbox),
})
log = {
'hash': hash,
'timestamp': int(time.timestamp()),
'year': time.year,
'month': time.month,
'day': time.day,
'hour': time.hour,
'minute': time.minute,
'second': time.second,
'microsecond': time.microsecond,
'org_content_size': line_json['OriginContentSize'],
'req_address': line_json['RequestAddr'],
'req_content_size': line_json['RequestContentSize'],
'req_count': line_json['RequestCount'],
'req_host': line_json['RequestHost'],
'req_method': line_json['RequestMethod'],
'req_path': line_json['RequestPath'],
'req_port': line_json['RequestPort'],
'req_protocol': line_json['RequestProtocol'],
'req_scheme': line_json['RequestScheme'],
'dge_application': dge_application,
'ogc_service': ogc_service,
'ogc_version': ogc_version,
'ogc_request': ogc_request,
'ogc_workspace': ogc_workspace,
'ogc_layers': ogc_layers,
'ogc_crs': ogc_crs,
'ogc_bbox': ogc_bbox,
}
logs.append(log)
num_line += 1
if num_line % group == 0:
print(num_line)
write_parquet(logs, pq_file_logs)
logs = []
write_parquet(layers, pq_file_layers)
layers = []
if __name__ == "__main__":
print('=' * 80)
start = datetime.datetime.now()
print("Start:", start.strftime("%H:%M:%S"))
for tk_file in tk_files:
start_table = datetime.datetime.now()
print('-' * 80)
if os.path.isfile(tk_file):
convert_file(tk_file, pq_file_logs, pq_file_layers, limit=limit)
else:
print(tk_file, 'does not exist.')
end_table = datetime.datetime.now()
print("Duration:", str(end_table - start_table).split('.')[0])
end = datetime.datetime.now()
print()
print("End:", end.strftime("%H:%M:%S"))
print("Total duration:", str(end - start).split('.')[0])
print('=' * 80)

parquet_sqlite_stats.py Normal file (+73 lines)

@@ -0,0 +1,73 @@
import datetime
import duckdb
TABLES = ['LOGS_2023_07', 'LOGS_2023_08', 'LOGS_2023_09', 'LOGS_2023_10', 'LOGS_2023_11', 'LOGS_2023_12']
sql_select_nb_lines = "SELECT COUNT(id) AS nb_lines FROM read_parquet('{table}');"
sql_select_nb_requests = "SELECT COUNT(id) AS nb_requests FROM read_parquet('{table}');"
sql_select_nb_requests_by_service = "SELECT ogc_service AS service, COUNT(id) AS requests FROM read_parquet('{table}') WHERE dge_application IN ('geoserver', 'geonetwork') and ogc_service IN ('wms', 'wfs', 'wmts', 'csw') GROUP BY ogc_service ORDER BY ogc_service;"
sql_select_content_size = "SELECT SUM(CAST(org_content_size as INTEGER)) AS size FROM read_parquet('{table}');"
sql_select_content_size_by_service = "SELECT ogc_service AS service, SUM(CAST(org_content_size as INTEGER)) AS size FROM read_parquet('{table}') WHERE dge_application IN ('geoserver', 'geonetwork') and ogc_service IN ('wms', 'wfs', 'wmts', 'csw') GROUP BY ogc_service ORDER BY ogc_service;"
def read_tables(tables):
result = {}
for table in tables:
pq_file = './parquet/' + table + '.parquet'
result[table] = {}
result[table]['nb_lines'] = duckdb.sql(sql_select_nb_lines.format(table=pq_file)).fetchone()[0]
result[table]['nb_requests_all'] = duckdb.sql(sql_select_nb_requests.format(table=pq_file)).fetchone()[0]
nb_requests_by_service = duckdb.sql(sql_select_nb_requests_by_service.format(table=pq_file)).fetchall()
result[table]['nb_requests_other'] = 0
result[table]['nb_requests_services'] = 0
for row in nb_requests_by_service:
result[table]['nb_' + row[0]] = row[1]
result[table]['nb_requests_services'] += row[1]
result[table]['size_all'] = duckdb.sql(sql_select_content_size.format(table=pq_file)).fetchone()[0]
size_by_service = duckdb.sql(sql_select_content_size_by_service.format(table=pq_file)).fetchall()
result[table]['size_other'] = 0
result[table]['size_services'] = 0
for row in size_by_service:
result[table]['size_' + row[0]] = row[1]
result[table]['size_services'] += row[1]
result[table]['nb_requests_other'] = result[table]['nb_requests_all'] - result[table]['nb_requests_services']
result[table]['size_other'] = result[table]['size_all'] - result[table]['size_services']
return result
def print_result(result):
print('-'*80)
lines = []
first = True
for table in result:
if first:
line = ['TABLE']
line.extend([str(key).upper() for key in result[table]])
lines.append(','.join(line))
first = False
line = [table]
line.extend([str(result[table][key]) for key in result[table]])
lines.append(','.join(line))
print('\n'.join(lines))
print('-'*80)
if __name__ == "__main__":
print('=' * 80)
start = datetime.datetime.now()
print("Start:", start.strftime("%H:%M:%S"))
result = read_tables(tables=TABLES)
print_result(result)
end = datetime.datetime.now()
print("End:", end.strftime("%H:%M:%S"))
print("Total duration:", str(end - start).split('.')[0])
print('=' * 80)

parquet_stats.py Normal file (+72 lines)

@@ -0,0 +1,72 @@
import datetime
import duckdb
pq_file = './parquet/logs.parquet'
sql_select_by_month = "SELECT year AS year, month AS month, COUNT(*) AS nb_lines, SUM(CAST(org_content_size as INTEGER)) AS size FROM read_parquet('{table}') GROUP BY year, month;"
sql_select_requests_by_service = "SELECT year AS year, month, ogc_service AS service, COUNT(*) AS requests FROM read_parquet('{table}') WHERE dge_application IN ('geoserver', 'geonetwork') and ogc_service IN ('wms', 'wfs', 'wmts', 'csw') GROUP BY year, month, ogc_service ORDER BY year, month, ogc_service;"
sql_select_size_by_service = "SELECT year AS year, month, ogc_service AS service, SUM(CAST(org_content_size as INTEGER)) AS size FROM read_parquet('{table}') WHERE dge_application IN ('geoserver', 'geonetwork') and ogc_service IN ('wms', 'wfs', 'wmts', 'csw') GROUP BY year, month, ogc_service ORDER BY year, month, ogc_service;"
def get_data(pq_file):
df_total = duckdb.sql(sql_select_by_month.format(table=pq_file)).df()
df_requests_by_service = duckdb.sql(sql_select_requests_by_service.format(table=pq_file)).df()
df_size_by_service = duckdb.sql(sql_select_size_by_service.format(table=pq_file)).df()
result= {}
for index, row in df_total.iterrows():
month = str(int(row['year'])) + '_' + str(int(row['month'])).rjust(2, '0')
result[month] = {
'nb_all': row['nb_lines'],
'size_all': row['size'],
}
for index, row in df_requests_by_service.iterrows():
month = str(int(row['year'])) + '_' + str(int(row['month'])).rjust(2, '0')
result[month]['nb_services'] = result[month].get('nb_services', 0) + row['requests']  # accumulate, counting the first service as well
result[month]['nb_' + row['service']] = row['requests']
for index, row in df_size_by_service.iterrows():
month = str(int(row['year'])) + '_' + str(int(row['month'])).rjust(2, '0')
result[month]['size_services'] = result[month].get('size_services', 0) + row['size']  # accumulate, counting the first service as well
result[month]['size_' + row['service']] = row['size']
for row in result:
result[row]['nb_other'] = result[row]['nb_all'] - result[row]['nb_services']
result[row]['size_other'] = result[row]['size_all'] - result[row]['size_services']
return result
def print_result(result):
print('-'*80)
lines = []
first = True
for month in result:
if first:
line = ['MONTH']
line.extend([str(key).upper() for key in result[month]])
lines.append(','.join(line))
first = False
line = [month]
line.extend([str(int(result[month][key])) for key in result[month]])
lines.append(','.join(line))
print('\n'.join(lines))
print('-'*80)
if __name__ == "__main__":
print('=' * 80)
start = datetime.datetime.now()
print("Start:", start.strftime("%H:%M:%S"))
result = get_data(pq_file)
print_result(result)
end = datetime.datetime.now()
print("End:", end.strftime("%H:%M:%S"))
print("Total duration:", str(end - start).split('.')[0])
print('=' * 80)

sql/create_indexes.sql Normal file (+36 lines)

@@ -0,0 +1,36 @@
CREATE UNIQUE INDEX "ix_logs_2023_07_id" ON "LOGS_2023_07" (
"id"
);
CREATE INDEX "ix_logs_2023_07_service" ON "LOGS_2023_07" (
"ogc_service"
);
CREATE UNIQUE INDEX "ix_logs_2023_08_id" ON "LOGS_2023_08" (
"id"
);
CREATE INDEX "ix_logs_2023_08_service" ON "LOGS_2023_08" (
"ogc_service"
);
CREATE UNIQUE INDEX "ix_logs_2023_09_id" ON "LOGS_2023_09" (
"id"
);
CREATE INDEX "ix_logs_2023_09_service" ON "LOGS_2023_09" (
"ogc_service"
);
CREATE UNIQUE INDEX "ix_logs_2023_10_id" ON "LOGS_2023_10" (
"id"
);
CREATE INDEX "ix_logs_2023_10_service" ON "LOGS_2023_10" (
"ogc_service"
);
CREATE UNIQUE INDEX "ix_logs_2023_11_id" ON "LOGS_2023_11" (
"id"
);
CREATE INDEX "ix_logs_2023_11_service" ON "LOGS_2023_11" (
"ogc_service"
);
CREATE UNIQUE INDEX "ix_logs_2023_12_id" ON "LOGS_2023_12" (
"id"
);
CREATE INDEX "ix_logs_2023_12_service" ON "LOGS_2023_12" (
"ogc_service"
);

sql/dge_traefik_logs.sql Normal file (+55 lines)

@@ -0,0 +1,55 @@
DROP TABLE IF EXISTS "main"."{table}";
CREATE TABLE IF NOT EXISTS "main"."{table}" (
"id" INTEGER NOT NULL UNIQUE,
"hash" TEXT,
"timestamp" INTEGER,
"year" INTEGER,
"month" INTEGER,
"day" INTEGER,
"hour" INTEGER,
"minute" INTEGER,
"second" INTEGER,
"microsecond" INTEGER,
"org_content_size" TEXT,
"req_address" TEXT,
"req_content_size" INTEGER,
"req_count" INTEGER,
"req_host" TEXT,
"req_method" TEXT,
"req_path" TEXT,
"req_port" TEXT,
"req_protocol" TEXT,
"req_scheme" TEXT,
"ogc_service" TEXT,
"ogc_workspace" TEXT,
"ogc_layers" TEXT,
"ogc_request" TEXT,
"ogc_epsg" TEXT,
PRIMARY KEY("id" AUTOINCREMENT)
);
INSERT INTO "main"."{table}"
("hash", "timestamp", "year", "month", "day", "hour", "minute", "microsecond", "org_content_size", "req_address", "req_content_size", "req_count", "req_host", "req_method", "req_path", "req_port", "req_protocol", "req_scheme", "ogc_service", "ogc_workspace", "ogc_layers", "ogc_request", "ogc_epsg")
VALUES
(:hash, :timestamp, :year, :month, :day, :hour, :minute, :microsecond, :org_content_size, :req_address, :req_content_size, :req_count, :req_host, :req_method, :req_path, :req_port, :req_protocol, :req_scheme, :ogc_service, :ogc_workspace, :ogc_layers, :ogc_request, :ogc_epsg);
SELECT id, hash, timestamp, year, month, day, hour, minute, microsecond, org_content_size, req_address, req_content_size, req_count, req_host, req_method, req_path, req_port, req_protocol, req_scheme, ogc_service, ogc_workspace, ogc_layers, ogc_request, ogc_epsg FROM {table};
SELECT id, hash, timestamp, year, month, day, hour, minute, microsecond, org_content_size, req_address, req_content_size, req_count, req_host, req_method, req_path, req_port, req_protocol, req_scheme, ogc_service, ogc_workspace, ogc_layers, ogc_request, ogc_epsg FROM {table} WHERE ogc_service <> '';
SELECT id, hash, timestamp, year, month, day, hour, minute, microsecond, org_content_size, req_address, req_content_size, req_count, req_host, req_method, req_path, req_port, req_protocol, req_scheme, ogc_service, ogc_workspace, ogc_layers, ogc_request, ogc_epsg FROM {table} WHERE ogc_service = '';
SELECT id, hash, timestamp, year, month, day, hour, minute, microsecond, org_content_size, req_address, req_content_size, req_count, req_host, req_method, req_path, req_port, req_protocol, req_scheme, ogc_service, ogc_workspace, ogc_layers, ogc_request, ogc_epsg FROM {table} WHERE ogc_request = 'getmap';
CREATE TABLE IF NOT EXISTS "main"."{table}" (
"id" INTEGER NOT NULL UNIQUE,
"id_logs" INTEGER,
"service" TEXT,
"workspace" TEXT,
"layer" TEXT,
"request" TEXT,
"epsg" TEXT,
PRIMARY KEY("id" AUTOINCREMENT)
);

sql/update_to_lower.sql Normal file (+26 lines)

@@ -0,0 +1,26 @@
UPDATE LOGS_2023_07
SET ogc_service = LOWER(ogc_service), ogc_request = LOWER(ogc_request), ogc_crs = LOWER(ogc_crs), ogc_workspace = LOWER(ogc_workspace);
UPDATE LOGS_2023_08
SET ogc_service = LOWER(ogc_service), ogc_request = LOWER(ogc_request), ogc_crs = LOWER(ogc_crs), ogc_workspace = LOWER(ogc_workspace);
UPDATE LOGS_2023_09
SET ogc_service = LOWER(ogc_service), ogc_request = LOWER(ogc_request), ogc_crs = LOWER(ogc_crs), ogc_workspace = LOWER(ogc_workspace);
UPDATE LOGS_2023_10
SET ogc_service = LOWER(ogc_service), ogc_request = LOWER(ogc_request), ogc_crs = LOWER(ogc_crs), ogc_workspace = LOWER(ogc_workspace);
UPDATE LOGS_2023_11
SET ogc_service = LOWER(ogc_service), ogc_request = LOWER(ogc_request), ogc_crs = LOWER(ogc_crs), ogc_workspace = LOWER(ogc_workspace);
UPDATE LOGS_2023_12
SET ogc_service = LOWER(ogc_service), ogc_request = LOWER(ogc_request), ogc_crs = LOWER(ogc_crs), ogc_workspace = LOWER(ogc_workspace);
UPDATE LAYERS_2023_07
SET service = LOWER(service), request = LOWER(request), crs = LOWER(crs), workspace = LOWER(workspace);
UPDATE LAYERS_2023_08
SET service = LOWER(service), request = LOWER(request), crs = LOWER(crs), workspace = LOWER(workspace);
UPDATE LAYERS_2023_09
SET service = LOWER(service), request = LOWER(request), crs = LOWER(crs), workspace = LOWER(workspace);
UPDATE LAYERS_2023_10
SET service = LOWER(service), request = LOWER(request), crs = LOWER(crs), workspace = LOWER(workspace);
UPDATE LAYERS_2023_11
SET service = LOWER(service), request = LOWER(request), crs = LOWER(crs), workspace = LOWER(workspace);
UPDATE LAYERS_2023_12
SET service = LOWER(service), request = LOWER(request), crs = LOWER(crs), workspace = LOWER(workspace);

sqlite_import.py Normal file (+273 lines)

@@ -0,0 +1,273 @@
import os
import math
import json
import datetime
import sqlite3
import hashlib
import re
import copy
DB_FILE = './db/dge_traefik_logs.db3'
DATA_DIRECTORY = './data/'
TK_DATES = ['2023-07', '2023-08', '2023-09', '2023-10', '2023-11', '2023-12']
LIMIT = False
APPLICATION_FROM_PATH = {
'/cas': 'cas',
'/console': 'console',
'/login': 'login',
'/cadastrapp': 'cadastrapp',
'/geoserver': 'geoserver',
'/metadata': 'metadata',
'/files': 'files',
'/geonetwork': 'geonetwork',
'/tools/mdedit': 'mdedit',
'/tools/mviewer': 'mviewer',
'/mviewer': 'mviewer',
'/mapstore': 'mapstore',
'/geocontrib': 'geocontrib',
'/data4citizen': 'data4citizen',
'/portail': 'cms',
'/': 'root',
'/robots': 'robots',
}
sql_drop_table = '''DROP TABLE IF EXISTS "main"."{table}";'''
sql_create_table_logs = '''
CREATE TABLE IF NOT EXISTS "main"."{table}" (
"id" INTEGER NOT NULL UNIQUE,
"hash" TEXT,
"timestamp" INTEGER,
"year" INTEGER,
"month" INTEGER,
"day" INTEGER,
"hour" INTEGER,
"minute" INTEGER,
"second" INTEGER,
"microsecond" INTEGER,
"org_content_size" TEXT,
"req_address" TEXT,
"req_content_size" INTEGER,
"req_count" INTEGER,
"req_host" TEXT,
"req_method" TEXT,
"req_path" TEXT,
"req_port" TEXT,
"req_protocol" TEXT,
"req_scheme" TEXT,
"dge_application" TEXT,
"ogc_service" TEXT,
"ogc_version" TEXT,
"ogc_workspace" TEXT,
"ogc_layers" TEXT,
"ogc_request" TEXT,
"ogc_crs" TEXT,
"ogc_bbox" TEXT,
PRIMARY KEY("id" AUTOINCREMENT)
);
'''
sql_insert_logs = '''
INSERT INTO "main"."{table}"
("hash", "timestamp", "year", "month", "day", "hour", "minute", "microsecond", "org_content_size", "req_address", "req_content_size", "req_count", "req_host", "req_method", "req_path", "req_port", "req_protocol", "req_scheme", "dge_application", "ogc_service", "ogc_version", "ogc_workspace", "ogc_layers", "ogc_request", "ogc_crs", "ogc_bbox")
VALUES
(:hash, :timestamp, :year, :month, :day, :hour, :minute, :microsecond, :org_content_size, :req_address, :req_content_size, :req_count, :req_host, :req_method, :req_path, :req_port, :req_protocol, :req_scheme, :dge_application, :ogc_service, :ogc_version, :ogc_workspace, :ogc_layers, :ogc_request, :ogc_crs, :ogc_bbox);
'''
sql_create_table_layers = '''
CREATE TABLE IF NOT EXISTS "main"."{table}" (
"id" INTEGER NOT NULL UNIQUE,
"id_logs" INTEGER,
"service" TEXT,
"version" TEXT,
"workspace" TEXT,
"layer" TEXT,
"request" TEXT,
"crs" TEXT,
"bbox" TEXT,
PRIMARY KEY("id" AUTOINCREMENT)
);
'''
sql_insert_layers = '''
INSERT INTO "main"."{table}"
("id_logs", "service", "version", "workspace", "layer", "request", "crs", "bbox")
VALUES
(:id_logs, :service, :version, :workspace, :layer, :request, :crs, :bbox);
'''
sql_create_id_index = '''
CREATE UNIQUE INDEX "ix_logs_{tk_date}_id" ON "{table}" (
"id"
);
'''
sql_create_service_index = '''
CREATE INDEX "ix_logs_{tk_date}_service" ON "{table}" (
"ogc_service"
);
'''
sql_create_application_index = '''
CREATE INDEX "ix_logs_{tk_date}_application" ON "{table}" (
"dge_application"
);
'''
def convert_size(size_bytes):
if size_bytes == 0:
return "0B"
size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
i = int(math.floor(math.log(size_bytes, 1024)))
p = math.pow(1024, i)
s = round(size_bytes / p, 2)
return "%s %s" % (s, size_name[i])
def import_tk_file(db_file, tk_file, limit=False):
tk_file_size = os.path.getsize(tk_file)
print(tk_file, '-', convert_size(tk_file_size))
with sqlite3.connect(DB_FILE) as conn:
c = conn.cursor()
tk_date = os.path.basename(tk_file).replace('-', '_')
tables = [
'LOGS_' + tk_date,
'LAYERS_' + tk_date
]
# Drop tables
for table in tables:
c.execute(sql_drop_table.format(table=table))
# Create tables
c.execute(sql_create_table_logs.format(table=tables[0]))
c.execute(sql_create_table_layers.format(table=tables[1]))
c.execute(sql_create_service_index.format(tk_date=tk_date, table=tables[0]))
c.execute(sql_create_application_index.format(tk_date=tk_date, table=tables[0]))
conn.commit()
with open(tk_file, 'r', encoding="utf-8") as tkf:
num_line = 0
while (num_line < limit or not limit):
line = tkf.readline()
if not line:
break
else:
hash = hashlib.sha256(line.encode('utf-8')).hexdigest()
line_json = json.loads(line)
time = datetime.datetime.fromisoformat(line_json['StartUTC'])
dge_application = 'other'
for application in APPLICATION_FROM_PATH:
if line_json['RequestPath'].startswith(application):
dge_application = APPLICATION_FROM_PATH[application]
break
ogc_service = re.findall("[?|&]service=([a-z]*)[?|&]?", line_json['RequestPath'], re.IGNORECASE)
ogc_service = ogc_service[0].lower() if ogc_service else ''
ogc_request = re.findall("[?|&]request=([a-z]*)[?|&]?", line_json['RequestPath'], re.IGNORECASE)
ogc_request = ogc_request[0].lower() if len(ogc_request) else ''
ogc_version = re.findall("[?|&]version=([0-9.]*)[?|&]?", line_json['RequestPath'], re.IGNORECASE)
ogc_version = ogc_version[0].lower() if len(ogc_version) else ''
ogc_crs = re.findall("[?|&]crs=([a-z0-9.:]*)[?|&]?", line_json['RequestPath'], re.IGNORECASE)
ogc_crs = ogc_crs[0].lower() if ogc_crs else ''
ogc_bbox = re.findall("[?|&]bbox=([a-z0-9.:;,]*)[?|&]?", line_json['RequestPath'], re.IGNORECASE)
ogc_bbox = ogc_bbox[0] if ogc_bbox else ''
ogc_workspace = re.findall("/geoserver/([a-z0-9_.]*)/[a-z]", line_json['RequestPath'], re.IGNORECASE)
ogc_workspace = ogc_workspace[0].lower() if len(ogc_workspace) else ''
ogc_layers = re.findall("[?|&]layers=([a-z0-9_.:,-]*)[?|&|$]", line_json['RequestPath'], re.IGNORECASE)
ogc_layers = ogc_layers[0] if len(ogc_layers) else ''
layers = []
if ogc_layers:
for ogc_layer in ogc_layers.split(','):
layer_parts = ogc_layer.split(':')
if len(layer_parts) == 2:
layer = layer_parts[1]
workspace = layer_parts[0]
else:
layer = layer_parts[0]
workspace = ogc_workspace
layers.append({
'version': copy.deepcopy(ogc_version),
'service': copy.deepcopy(ogc_service),
'request': copy.deepcopy(ogc_request),
'workspace': copy.deepcopy(workspace),
'layer': copy.deepcopy(layer),
'crs': copy.deepcopy(ogc_crs),
'bbox': copy.deepcopy(ogc_bbox),
})
data = {
'hash': hash,
'timestamp': int(time.timestamp()),
'year': time.year,
'month': time.month,
'day': time.day,
'hour': time.hour,
'minute': time.minute,
'second': time.second,
'microsecond': time.microsecond,
'org_content_size': line_json['OriginContentSize'],
'req_address': line_json['RequestAddr'],
'req_content_size': line_json['RequestContentSize'],
'req_count': line_json['RequestCount'],
'req_host': line_json['RequestHost'],
'req_method': line_json['RequestMethod'],
'req_path': line_json['RequestPath'],
'req_port': line_json['RequestPort'],
'req_protocol': line_json['RequestProtocol'],
'req_scheme': line_json['RequestScheme'],
'dge_application': dge_application,
'ogc_service': ogc_service,
'ogc_version': ogc_version,
'ogc_request': ogc_request,
'ogc_workspace': ogc_workspace,
'ogc_layers': ogc_layers,
'ogc_crs': ogc_crs,
'ogc_bbox': ogc_bbox,
}
# Insert log line
c.execute(sql_insert_logs.format(table=tables[0]), data)
id_logs = c.lastrowid
# Insert layers of log
for layer in layers:
layer['id_logs'] = id_logs
c.execute(sql_insert_layers.format(table=tables[1]), layer)
num_line += 1
conn.commit()
print("Nb lines:", num_line)
now = datetime.datetime.now()
print("Duration:", str(now - start).split('.')[0])
if __name__ == "__main__":
print('=' * 80)
start = datetime.datetime.now()
print("Start:", start.strftime("%H:%M:%S"))
for tk_date in TK_DATES:
start_table = datetime.datetime.now()
print('-' * 80)
tk_file = DATA_DIRECTORY + tk_date
if os.path.isfile(tk_file):
import_tk_file(db_file=DB_FILE, tk_file=tk_file, limit=LIMIT)
else:
print(tk_file, 'does not exist.')
end_table = datetime.datetime.now()
print("Duration:", str(end_table - start_table).split('.')[0])
print('-' * 80)
end = datetime.datetime.now()
print("End:", end.strftime("%H:%M:%S"))
print("Total duration:", str(end - start).split('.')[0])
print('=' * 80)

sqlite_stats.py Normal file (+100 lines)

@@ -0,0 +1,100 @@
import sqlite3
import datetime
DB_FILE = './dge_traefik_logs.db3'
TABLES = ['LOGS_2023_07', 'LOGS_2023_08', 'LOGS_2023_09', 'LOGS_2023_10', 'LOGS_2023_11', 'LOGS_2023_12']
sql_create_id_index = '''
CREATE UNIQUE INDEX IF NOT EXISTS "ix_logs_{tk_date}_id" ON "{table}" (
"id"
);
'''
sql_create_service_index = '''
CREATE INDEX IF NOT EXISTS "ix_logs_{tk_date}_service" ON "{table}" (
"ogc_service"
);
'''
sql_create_application_index = '''
CREATE INDEX IF NOT EXISTS "ix_logs_{tk_date}_application" ON "{table}" (
"dge_application"
);
'''
sql_select_nb_lines = "SELECT COUNT(id) AS nb_lines FROM {table};"
sql_select_nb_requests = "SELECT COUNT(id) AS nb_requests FROM {table};"
sql_select_nb_requests_by_service = "SELECT ogc_service AS service, COUNT(id) AS requests FROM {table} WHERE dge_application IN ('geoserver', 'geonetwork') and ogc_service IN ('wms', 'wfs', 'wmts', 'csw') GROUP BY ogc_service ORDER BY ogc_service;"
sql_select_content_size = "SELECT SUM(org_content_size) AS size FROM {table};"
sql_select_content_size_by_service = "SELECT ogc_service AS service, SUM(org_content_size) AS size FROM {table} WHERE dge_application IN ('geoserver', 'geonetwork') and ogc_service IN ('wms', 'wfs', 'wmts', 'csw') GROUP BY ogc_service ORDER BY ogc_service;"
def read_tables(db_file):
result = {}
with sqlite3.connect(DB_FILE) as conn:
c = conn.cursor()
for table in TABLES:
print('Table:', table)
start_table = datetime.datetime.now()
tk_date = table.replace('LOGS_', '')
c.execute(sql_create_service_index.format(tk_date=tk_date, table=table))
c.execute(sql_create_application_index.format(tk_date=tk_date, table=table))
result[table] = {}
c.execute(sql_select_nb_lines.format(table=table))
result[table]['nb_lines'] = c.fetchone()[0]
c.execute(sql_select_nb_requests_by_service.format(table=table))
for row in c.fetchall():
result[table]['nb_' + row[0]] = row[1]
c.execute(sql_select_nb_requests.format(table=table))
result[table]['nb_all'] = c.fetchone()[0]
c.execute(sql_select_content_size_by_service.format(table=table))
for row in c.fetchall():
result[table]['size_' + row[0]] = row[1]
c.execute(sql_select_content_size.format(table=table))
result[table]['size_all'] = c.fetchone()[0]
end_table = datetime.datetime.now()
print("Duration:", str(end_table - start_table).split('.')[0])
return result
def print_result(result):
print('-'*80)
lines = []
first = True
for table in result:
if first:
line = ['TABLE']
line.extend([str(key).upper() for key in result[table]])
lines.append(','.join(line))
first = False
line = [table]
line.extend([str(result[table][key]) for key in result[table]])
lines.append(','.join(line))
print('\n'.join(lines))
print('-'*80)
if __name__ == "__main__":
print('=' * 80)
start = datetime.datetime.now()
print("Start:", start.strftime("%H:%M:%S"))
result = read_tables(DB_FILE)
print_result(result)
end = datetime.datetime.now()
print("End:", end.strftime("%H:%M:%S"))
print("Total duration:", str(end - start).split('.')[0])
print('=' * 80)

sqlite_to_parquet.py Normal file (+41 lines)

@@ -0,0 +1,41 @@
import os
import duckdb
duckdb.sql('''
ATTACH './db/dge_traefik_logs.db3' AS logs;
USE logs;
''')
r = duckdb.sql('''
SHOW TABLES;
''')
print(r)
tables = duckdb.sql('''
SHOW TABLES;
''').pl().get_column('name').to_list()
# print(tables[:10])
for table in tables:
print(table)
if not os.path.isfile(table + '.parquet'):
duckdb.sql(
f'''
COPY (SELECT * FROM logs.{table})
TO '{table}.parquet' (FORMAT 'parquet')
'''
)
nb = duckdb.sql(
f'''
FROM '{table}.parquet'
SELECT COUNT(*)
''')
print(nb)