first commit 20240311

Guillaume RYCKELYNCK 2024-03-11 08:36:55 +01:00
commit 9b41e2350f
11 changed files with 947 additions and 0 deletions

.gitignore vendored Normal file (+5 lines)

@@ -0,0 +1,5 @@
python
data/
db/
parquet/
results/

README.md Normal file (+86 lines)

@@ -0,0 +1,86 @@
# DGE Traefik logs

The goal of this project is (1) to convert the Traefik log files from JSON into a format better suited to their exploitation and (2) to allow querying the data in order to extract statistics and indicators on the usage of the DataGrandEst platform.

Two approaches were studied:

- Using a SQLite database
- Using Parquet files

Note that the source files represent about 113 GB of data in total.

## SQLITE database

This is a fairly conventional approach and quite simple to implement.

Chosen approach:

- Create a sqlite3 database
- Read each monthly file and load it into the database (adapting it as needed)
- Process the data in SQL

It relies on 2 scripts:

1. `sqlite_import.py` = import the logs into the SQLite database
2. `sqlite_stats.py` = compute the main monthly stats

### Import

cf. `sqlite_import.py`

Each log file is imported line by line into a dedicated monthly table.
E.g. the file "2023-07" is imported into the table "LOGS_2023_07", and so on.
In parallel, the WMS requests are imported into a dedicated table, "LAYERS_2023_07" (following the previous example). The goal is eventually to have statistics on the most consulted layers.
The same work remains to be done for WFS and WMTS layers.
Importing all the files takes about 1 hour. The final database file is about 56 GB.
``` txt
2023-07 - 8.26 GB - 0:05:24
2023-08 - 16.29 GB - 0:09:42
2023-09 - 20.12 GB - 0:11:45
2023-10 - 23.26 GB - 0:12:44
2023-11 - 25.97 GB - 0:13:45
2023-12 - 19.7 GB - 0:10:17
```
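
For reference, here is a simplified sketch of the import loop used in `sqlite_import.py` (table creation and the OGC/layer parsing are omitted, and only a few of the columns are shown):

``` python
import datetime
import hashlib
import json
import sqlite3

def import_month(db_file, tk_file, table):
    """Insert each JSON log line of tk_file into its monthly table (e.g. LOGS_2023_07)."""
    with sqlite3.connect(db_file) as conn, open(tk_file, encoding="utf-8") as tkf:
        for line in tkf:
            row = json.loads(line)
            time = datetime.datetime.fromisoformat(row['StartUTC'])
            conn.execute(
                f'INSERT INTO "{table}" ("hash", "timestamp", "req_path") VALUES (?, ?, ?)',
                (hashlib.sha256(line.encode('utf-8')).hexdigest(),
                 int(time.timestamp()),
                 row['RequestPath']))

import_month('./db/dge_traefik_logs.db3', './data/2023-07', 'LOGS_2023_07')
```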

### Stats

cf. `sqlite_stats.py`

This script computes stats by running SELECT queries against the database.
The result is output as CSV; a copy/paste is enough to bring it into an Excel spreadsheet.
Computing the stats takes about 5 to 7 minutes.
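
The queries, run once per monthly table, are of this kind (the SQL is taken from `sqlite_stats.py`; the database path is the one used by `sqlite_import.py`):

``` python
import sqlite3

# Total number of log lines, and requests per OGC service, for one monthly table.
sql_nb_lines = "SELECT COUNT(id) AS nb_lines FROM {table};"
sql_nb_by_service = (
    "SELECT ogc_service AS service, COUNT(id) AS requests FROM {table} "
    "WHERE dge_application IN ('geoserver', 'geonetwork') "
    "AND ogc_service IN ('wms', 'wfs', 'wmts', 'csw') "
    "GROUP BY ogc_service ORDER BY ogc_service;")

with sqlite3.connect('./db/dge_traefik_logs.db3') as conn:
    table = 'LOGS_2023_07'
    print(conn.execute(sql_nb_lines.format(table=table)).fetchone()[0])
    print(conn.execute(sql_nb_by_service.format(table=table)).fetchall())
```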

## PARQUET format

Creating a Parquet file is not completely straightforward.

### SQLITE to PARQUET

cf. `sqlite_to_parquet.py`

The first attempt was to create Parquet files from the SQLite database (cf. `sqlite_to_parquet.py`).
This remains fairly simple, but it requires the rather long preliminary step of generating the SQLite database (cf. above). The result is one Parquet file per table of the database.
The data is then queried with `parquet_sqlite_stats.py`.
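
The conversion itself boils down to one DuckDB COPY per table, as in `sqlite_to_parquet.py` (the output path is adjusted here to the `./parquet/` directory read by `parquet_sqlite_stats.py`):

``` python
import duckdb

# Attach the SQLite database, then export every table to its own Parquet file.
duckdb.sql("ATTACH './db/dge_traefik_logs.db3' AS logs; USE logs;")
tables = duckdb.sql("SHOW TABLES;").pl().get_column('name').to_list()
for table in tables:
    duckdb.sql(f"COPY (SELECT * FROM logs.{table}) TO './parquet/{table}.parquet' (FORMAT 'parquet')")
```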

### JSON to PARQUET

cf. `parquet_import.py`

Another test consisted in loading the logs directly into Parquet files, following the same model as for the SQLite database (cf. `parquet_import.py`).
The operation is fairly long but has 2 advantages:

- In the end there are only 2 Parquet files, "logs.parquet" and "layers.parquet", containing all the data
- The file size is much smaller: 8 GB in total for the 2 files, whereas the SQLite database is 56 GB and the source JSON data 113 GB.

The conversion takes about 1h30.
The data is then queried with `parquet_stats.py`.
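
Querying the Parquet files is then done directly with DuckDB's `read_parquet()`, e.g. the monthly totals computed in `parquet_stats.py`:

``` python
import duckdb

# Monthly line counts and transferred volume, straight from logs.parquet.
df = duckdb.sql("""
    SELECT year, month, COUNT(*) AS nb_lines,
        SUM(CAST(org_content_size AS INTEGER)) AS size
    FROM read_parquet('./parquet/logs.parquet')
    GROUP BY year, month;
""").df()
print(df)
```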

## TODO

- [ ] Add the requests coming from WFS flows to "layers.parquet".
- [ ] Make it possible to generate "layers.parquet" from "logs.parquet", to make updates easier.

parquet_import.py Normal file (+180 lines)

@@ -0,0 +1,180 @@
import json
import re
import os
import copy
import datetime
import hashlib
import duckdb
import pandas as pd
import fastparquet as fp
tk_files = ['./data/2023-07', './data/2023-08', './data/2023-09', './data/2023-10', './data/2023-11', './data/2023-12']
pq_file_logs = './parquet/logs.parquet'
pq_file_layers = './parquet/layers.parquet'
limit = False
group = 1000000
APPLICATION_FROM_PATH = {
'/cas': 'cas',
'/console': 'console',
'/login': 'login',
'/cadastrapp': 'cadastrapp',
'/geoserver': 'geoserver',
'/metadata': 'metadata',
'/files': 'files',
'/geonetwork': 'geonetwork',
'/tools/mdedit': 'mdedit',
'/tools/mviewer': 'mviewer',
'/mviewer': 'mviewer',
'/mapstore': 'mapstore',
'/geocontrib': 'geocontrib',
'/data4citizen': 'data4citizen',
'/portail': 'cms',
'/': 'root',
'/robots': 'robots',
}
def write_parquet(data, pq_file):
df_logs = pd.DataFrame(data)
if not os.path.isfile(pq_file):
fp.write(pq_file, df_logs, compression='GZIP')
else:
fp.write(pq_file, df_logs, compression='GZIP', append=True)
def convert_file(tk_file, pq_file_logs, pq_file_layers, limit=False):
# convert TK file to JSON file
with open(tk_file, 'r', encoding="utf-8") as tkf:
num_line = 0
logs = []
layers = []
# df_logs = None
# df_layers = None
print('TK file:', tk_file)
print('0')
while (num_line < limit or not limit):
line = tkf.readline()
if not line:
write_parquet(logs, pq_file_logs)
write_parquet(layers, pq_file_layers)
break
else:
hash = hashlib.sha256(line.encode('utf-8')).hexdigest()
line_json = json.loads(line)
time = datetime.datetime.fromisoformat(line_json['StartUTC'])
dge_application = 'other'
for application in APPLICATION_FROM_PATH:
if line_json['RequestPath'].startswith(application):
dge_application = APPLICATION_FROM_PATH[application]
break
ogc_service = re.findall("[?|&]service=([a-z]*)[?|&]?", line_json['RequestPath'], re.IGNORECASE)
ogc_service = ogc_service[0].lower() if ogc_service else ''
ogc_request = re.findall("[?|&]request=([a-z]*)[?|&]?", line_json['RequestPath'], re.IGNORECASE)
ogc_request = ogc_request[0].lower() if len(ogc_request) else ''
ogc_version = re.findall("[?|&]version=([0-9.]*)[?|&]?", line_json['RequestPath'], re.IGNORECASE)
ogc_version = ogc_version[0].lower() if len(ogc_version) else ''
ogc_crs = re.findall("[?|&]crs=([a-z0-9.:]*)[?|&]?", line_json['RequestPath'], re.IGNORECASE)
ogc_crs = ogc_crs[0].lower() if ogc_crs else ''
ogc_bbox = re.findall("[?|&]bbox=([a-z0-9.:;,]*)[?|&]?", line_json['RequestPath'], re.IGNORECASE)
ogc_bbox = ogc_bbox[0] if ogc_bbox else ''
ogc_workspace = re.findall("/geoserver/([a-z0-9_.]*)/[a-z]", line_json['RequestPath'], re.IGNORECASE)
ogc_workspace = ogc_workspace[0].lower() if len(ogc_workspace) else ''
ogc_layers = re.findall("[?|&]layers=([a-z0-9_.:,-]*)[?|&|$]", line_json['RequestPath'], re.IGNORECASE)
ogc_layers = ogc_layers[0] if len(ogc_layers) else ''
if ogc_layers:
for ogc_layer in ogc_layers.split(','):
layer_parts = ogc_layer.split(':')
if len(layer_parts) == 2:
layer = layer_parts[1]
workspace = layer_parts[0]
else:
layer = layer_parts[0]
workspace = ogc_workspace
layers.append({
'log_hash': hash,
'timestamp': int(time.timestamp()),
'version': copy.deepcopy(ogc_version),
'application': copy.deepcopy(dge_application),
'service': copy.deepcopy(ogc_service),
'request': copy.deepcopy(ogc_request),
'workspace': copy.deepcopy(workspace),
'layer': copy.deepcopy(layer),
'crs': copy.deepcopy(ogc_crs),
'bbox': copy.deepcopy(ogc_bbox),
})
log = {
'hash': hash,
'timestamp': int(time.timestamp()),
'year': time.year,
'month': time.month,
'day': time.day,
'hour': time.hour,
'minute': time.minute,
'second': time.second,
'microsecond': time.microsecond,
'org_content_size': line_json['OriginContentSize'],
'req_address': line_json['RequestAddr'],
'req_content_size': line_json['RequestContentSize'],
'req_count': line_json['RequestCount'],
'req_host': line_json['RequestHost'],
'req_method': line_json['RequestMethod'],
'req_path': line_json['RequestPath'],
'req_port': line_json['RequestPort'],
'req_protocol': line_json['RequestProtocol'],
'req_scheme': line_json['RequestScheme'],
'dge_application': dge_application,
'ogc_service': ogc_service,
'ogc_version': ogc_version,
'ogc_request': ogc_request,
'ogc_workspace': ogc_workspace,
'ogc_layers': ogc_layers,
'ogc_crs': ogc_crs,
'ogc_bbox': ogc_bbox,
}
logs.append(log)
num_line += 1
if num_line % group == 0:
print(num_line)
write_parquet(logs, pq_file_logs)
logs = []
write_parquet(layers, pq_file_layers)
layers = []
if __name__ == "__main__":
print('=' * 80)
start = datetime.datetime.now()
print("Start:", start.strftime("%H:%M:%S"))
for tk_file in tk_files:
start_table = datetime.datetime.now()
print('-' * 80)
if os.path.isfile(tk_file):
convert_file(tk_file, pq_file_logs, pq_file_layers, limit=limit)
else:
print(tk_file, 'does not exist.')
end_table = datetime.datetime.now()
print("Duration:", str(end_table - start_table).split('.')[0])
end = datetime.datetime.now()
print()
print("End:", end.strftime("%H:%M:%S"))
print("Total duration:", str(end - start).split('.')[0])
print('=' * 80)

parquet_sqlite_stats.py Normal file (+73 lines)

@@ -0,0 +1,73 @@
import datetime
import duckdb
TABLES = ['LOGS_2023_07', 'LOGS_2023_08', 'LOGS_2023_09', 'LOGS_2023_10', 'LOGS_2023_11', 'LOGS_2023_12']
sql_select_nb_lines = "SELECT COUNT(id) AS nb_lines FROM read_parquet('{table}');"
sql_select_nb_requests = "SELECT COUNT(id) AS nb_requests FROM read_parquet('{table}');"
sql_select_nb_requests_by_service = "SELECT ogc_service AS service, COUNT(id) AS requests FROM read_parquet('{table}') WHERE dge_application IN ('geoserver', 'geonetwork') and ogc_service IN ('wms', 'wfs', 'wmts', 'csw') GROUP BY ogc_service ORDER BY ogc_service;"
sql_select_content_size = "SELECT SUM(CAST(org_content_size as INTEGER)) AS size FROM read_parquet('{table}');"
sql_select_content_size_by_service = "SELECT ogc_service AS service, SUM(CAST(org_content_size as INTEGER)) AS size FROM read_parquet('{table}') WHERE dge_application IN ('geoserver', 'geonetwork') and ogc_service IN ('wms', 'wfs', 'wmts', 'csw') GROUP BY ogc_service ORDER BY ogc_service;"
def read_tables(tables):
result = {}
for table in tables:
pq_file = './parquet/' + table + '.parquet'
result[table] = {}
result[table]['nb_lines'] = duckdb.sql(sql_select_nb_lines.format(table=pq_file)).fetchone()[0]
result[table]['nb_requests_all'] = duckdb.sql(sql_select_nb_requests.format(table=pq_file)).fetchone()[0]
nb_requests_by_service = duckdb.sql(sql_select_nb_requests_by_service.format(table=pq_file)).fetchall()
result[table]['nb_requests_other'] = 0
result[table]['nb_requests_services'] = 0
for row in nb_requests_by_service:
result[table]['nb_' + row[0]] = row[1]
result[table]['nb_requests_services'] += row[1]
result[table]['size_all'] = duckdb.sql(sql_select_content_size.format(table=pq_file)).fetchone()[0]
size_by_service = duckdb.sql(sql_select_content_size_by_service.format(table=pq_file)).fetchall()
result[table]['size_other'] = 0
result[table]['size_services'] = 0
for row in size_by_service:
result[table]['size_' + row[0]] = row[1]
result[table]['size_services'] += row[1]
result[table]['nb_requests_other'] = result[table]['nb_requests_all'] - result[table]['nb_requests_services']
result[table]['size_other'] = result[table]['size_all'] - result[table]['size_services']
return result
def print_result(result):
print('-'*80)
lines = []
first = True
for table in result:
if first:
line = ['TABLE']
line.extend([str(key).upper() for key in result[table]])
lines.append(','.join(line))
first = False
line = [table]
line.extend([str(result[table][key]) for key in result[table]])
lines.append(','.join(line))
print('\n'.join(lines))
print('-'*80)
if __name__ == "__main__":
print('=' * 80)
start = datetime.datetime.now()
print("Start:", start.strftime("%H:%M:%S"))
result = read_tables(tables=TABLES)
print_result(result)
end = datetime.datetime.now()
print("End:", end.strftime("%H:%M:%S"))
print("Total duration:", str(end - start).split('.')[0])
print('=' * 80)

parquet_stats.py Normal file (+72 lines)

@@ -0,0 +1,72 @@
import datetime
import duckdb
pq_file = './parquet/logs.parquet'
sql_select_by_month = "SELECT year AS year, month AS month, COUNT(*) AS nb_lines, SUM(CAST(org_content_size as INTEGER)) AS size FROM read_parquet('{table}') GROUP BY year, month;"
sql_select_requests_by_service = "SELECT year AS year, month, ogc_service AS service, COUNT(*) AS requests FROM read_parquet('{table}') WHERE dge_application IN ('geoserver', 'geonetwork') and ogc_service IN ('wms', 'wfs', 'wmts', 'csw') GROUP BY year, month, ogc_service ORDER BY year, month, ogc_service;"
sql_select_size_by_service = "SELECT year AS year, month, ogc_service AS service, SUM(CAST(org_content_size as INTEGER)) AS size FROM read_parquet('{table}') WHERE dge_application IN ('geoserver', 'geonetwork') and ogc_service IN ('wms', 'wfs', 'wmts', 'csw') GROUP BY year, month, ogc_service ORDER BY year, month, ogc_service;"
def get_data(pq_file):
df_total = duckdb.sql(sql_select_by_month.format(table=pq_file)).df()
df_requests_by_service = duckdb.sql(sql_select_requests_by_service.format(table=pq_file)).df()
df_size_by_service = duckdb.sql(sql_select_size_by_service.format(table=pq_file)).df()
result= {}
for index, row in df_total.iterrows():
month = str(int(row['year'])) + '_' + str(int(row['month'])).rjust(2, '0')
result[month] = {
'nb_all': row['nb_lines'],
'size_all': row['size'],
}
for index, row in df_requests_by_service.iterrows():
month = str(int(row['year'])) + '_' + str(int(row['month'])).rjust(2, '0')
result[month]['nb_services'] = result[month].get('nb_services', 0) + row['requests']  # accumulate, counting the first service as well
result[month]['nb_' + row['service']] = row['requests']
for index, row in df_size_by_service.iterrows():
month = str(int(row['year'])) + '_' + str(int(row['month'])).rjust(2, '0')
result[month]['size_services'] = result[month].get('size_services', 0) + row['size']  # accumulate, counting the first service as well
result[month]['size_' + row['service']] = row['size']
for row in result:
result[row]['nb_other'] = result[row]['nb_all'] - result[row]['nb_services']
result[row]['size_other'] = result[row]['size_all'] - result[row]['size_services']
return result
def print_result(result):
print('-'*80)
lines = []
first = True
for month in result:
if first:
line = ['MONTH']
line.extend([str(key).upper() for key in result[month]])
lines.append(','.join(line))
first = False
line = [month]
line.extend([str(int(result[month][key])) for key in result[month]])
lines.append(','.join(line))
print('\n'.join(lines))
print('-'*80)
if __name__ == "__main__":
print('=' * 80)
start = datetime.datetime.now()
print("Start:", start.strftime("%H:%M:%S"))
result = get_data(pq_file)
print_result(result)
end = datetime.datetime.now()
print("End:", end.strftime("%H:%M:%S"))
print("Total duration:", str(end - start).split('.')[0])
print('=' * 80)

sql/create_indexes.sql Normal file (+36 lines)

@@ -0,0 +1,36 @@
CREATE UNIQUE INDEX "ix_logs_2023_07_id" ON "LOGS_2023_07" (
"id"
);
CREATE INDEX "ix_logs_2023_07_service" ON "LOGS_2023_07" (
"ogc_service"
);
CREATE UNIQUE INDEX "ix_logs_2023_08_id" ON "LOGS_2023_08" (
"id"
);
CREATE INDEX "ix_logs_2023_08_service" ON "LOGS_2023_08" (
"ogc_service"
);
CREATE UNIQUE INDEX "ix_logs_2023_09_id" ON "LOGS_2023_09" (
"id"
);
CREATE INDEX "ix_logs_2023_09_service" ON "LOGS_2023_09" (
"ogc_service"
);
CREATE UNIQUE INDEX "ix_logs_2023_10_id" ON "LOGS_2023_10" (
"id"
);
CREATE INDEX "ix_logs_2023_10_service" ON "LOGS_2023_10" (
"ogc_service"
);
CREATE UNIQUE INDEX "ix_logs_2023_11_id" ON "LOGS_2023_11" (
"id"
);
CREATE INDEX "ix_logs_2023_11_service" ON "LOGS_2023_11" (
"ogc_service"
);
CREATE UNIQUE INDEX "ix_logs_2023_12_id" ON "LOGS_2023_12" (
"id"
);
CREATE INDEX "ix_logs_2023_12_service" ON "LOGS_2023_12" (
"ogc_service"
);

sql/dge_traefik_logs.sql Normal file (+55 lines)

@@ -0,0 +1,55 @@
DROP TABLE IF EXISTS "main"."{table}";
CREATE TABLE IF NOT EXISTS "main"."{table}" (
"id" INTEGER NOT NULL UNIQUE,
"hash" TEXT,
"timestamp" INTEGER,
"year" INTEGER,
"month" INTEGER,
"day" INTEGER,
"hour" INTEGER,
"minute" INTEGER,
"second" INTEGER,
"microsecond" INTEGER,
"org_content_size" TEXT,
"req_address" TEXT,
"req_content_size" INTEGER,
"req_count" INTEGER,
"req_host" TEXT,
"req_method" TEXT,
"req_path" TEXT,
"req_port" TEXT,
"req_protocol" TEXT,
"req_scheme" TEXT,
"ogc_service" TEXT,
"ogc_workspace" TEXT,
"ogc_layers" TEXT,
"ogc_request" TEXT,
"ogc_epsg" TEXT,
PRIMARY KEY("id" AUTOINCREMENT)
);
INSERT INTO "main"."{table}"
("hash", "timestamp", "year", "month", "day", "hour", "minute", "microsecond", "org_content_size", "req_address", "req_content_size", "req_count", "req_host", "req_method", "req_path", "req_port", "req_protocol", "req_scheme", "ogc_service", "ogc_workspace", "ogc_layers", "ogc_request", "ogc_epsg")
VALUES
(:hash, :timestamp, :year, :month, :day, :hour, :minute, :microsecond, :org_content_size, :req_address, :req_content_size, :req_count, :req_host, :req_method, :req_path, :req_port, :req_protocol, :req_scheme, :ogc_service, :ogc_workspace, :ogc_layers, :ogc_request, :ogc_epsg);
SELECT id, hash, timestamp, year, month, day, hour, minute, microsecond, org_content_size, req_address, req_content_size, req_count, req_host, req_method, req_path, req_port, req_protocol, req_scheme, ogc_service, ogc_workspace, ogc_layers, ogc_request, ogc_epsg FROM {table};
SELECT id, hash, timestamp, year, month, day, hour, minute, microsecond, org_content_size, req_address, req_content_size, req_count, req_host, req_method, req_path, req_port, req_protocol, req_scheme, ogc_service, ogc_workspace, ogc_layers, ogc_request, ogc_epsg FROM {table} WHERE ogc_service <> '';
SELECT id, hash, timestamp, year, month, day, hour, minute, microsecond, org_content_size, req_address, req_content_size, req_count, req_host, req_method, req_path, req_port, req_protocol, req_scheme, ogc_service, ogc_workspace, ogc_layers, ogc_request, ogc_epsg FROM {table} WHERE ogc_service = '';
SELECT id, hash, timestamp, year, month, day, hour, minute, microsecond, org_content_size, req_address, req_content_size, req_count, req_host, req_method, req_path, req_port, req_protocol, req_scheme, ogc_service, ogc_workspace, ogc_layers, ogc_request, ogc_epsg FROM {table} WHERE ogc_request = 'getmap';
CREATE TABLE IF NOT EXISTS "main"."{table}" (
"id" INTEGER NOT NULL UNIQUE,
"id_logs" INTEGER,
"service" TEXT,
"workspace" TEXT,
"layer" TEXT,
"request" TEXT,
"epsg" TEXT,
PRIMARY KEY("id" AUTOINCREMENT)
);

sql/update_to_lower.sql Normal file (+26 lines)

@@ -0,0 +1,26 @@
UPDATE LOGS_2023_07
SET ogc_service = LOWER(ogc_service), ogc_request = LOWER(ogc_request), ogc_crs = LOWER(ogc_crs), ogc_workspace = LOWER(ogc_workspace);
UPDATE LOGS_2023_08
SET ogc_service = LOWER(ogc_service), ogc_request = LOWER(ogc_request), ogc_crs = LOWER(ogc_crs), ogc_workspace = LOWER(ogc_workspace);
UPDATE LOGS_2023_09
SET ogc_service = LOWER(ogc_service), ogc_request = LOWER(ogc_request), ogc_crs = LOWER(ogc_crs), ogc_workspace = LOWER(ogc_workspace);
UPDATE LOGS_2023_10
SET ogc_service = LOWER(ogc_service), ogc_request = LOWER(ogc_request), ogc_crs = LOWER(ogc_crs), ogc_workspace = LOWER(ogc_workspace);
UPDATE LOGS_2023_11
SET ogc_service = LOWER(ogc_service), ogc_request = LOWER(ogc_request), ogc_crs = LOWER(ogc_crs), ogc_workspace = LOWER(ogc_workspace);
UPDATE LOGS_2023_12
SET ogc_service = LOWER(ogc_service), ogc_request = LOWER(ogc_request), ogc_crs = LOWER(ogc_crs), ogc_workspace = LOWER(ogc_workspace);
UPDATE LAYERS_2023_07
SET service = LOWER(service), request = LOWER(request), crs = LOWER(crs), workspace = LOWER(workspace);
UPDATE LAYERS_2023_08
SET service = LOWER(service), request = LOWER(request), crs = LOWER(crs), workspace = LOWER(workspace);
UPDATE LAYERS_2023_09
SET service = LOWER(service), request = LOWER(request), crs = LOWER(crs), workspace = LOWER(workspace);
UPDATE LAYERS_2023_10
SET service = LOWER(service), request = LOWER(request), crs = LOWER(crs), workspace = LOWER(workspace);
UPDATE LAYERS_2023_11
SET service = LOWER(service), request = LOWER(request), crs = LOWER(crs), workspace = LOWER(workspace);
UPDATE LAYERS_2023_12
SET service = LOWER(service), request = LOWER(request), crs = LOWER(crs), workspace = LOWER(workspace);

sqlite_import.py Normal file (+273 lines)

@@ -0,0 +1,273 @@
import os
import math
import json
import datetime
import sqlite3
import hashlib
import re
import copy
DB_FILE = './db/dge_traefik_logs.db3'
DATA_DIRECTORY = './data/'
TK_DATES = ['2023-07', '2023-08', '2023-09', '2023-10', '2023-11', '2023-12']
LIMIT = False
APPLICATION_FROM_PATH = {
'/cas': 'cas',
'/console': 'console',
'/login': 'login',
'/cadastrapp': 'cadastrapp',
'/geoserver': 'geoserver',
'/metadata': 'metadata',
'/files': 'files',
'/geonetwork': 'geonetwork',
'/tools/mdedit': 'mdedit',
'/tools/mviewer': 'mviewer',
'/mviewer': 'mviewer',
'/mapstore': 'mapstore',
'/geocontrib': 'geocontrib',
'/data4citizen': 'data4citizen',
'/portail': 'cms',
'/': 'root',
'/robots': 'robots',
}
sql_drop_table = '''DROP TABLE IF EXISTS "main"."{table}";'''
sql_create_table_logs = '''
CREATE TABLE IF NOT EXISTS "main"."{table}" (
"id" INTEGER NOT NULL UNIQUE,
"hash" TEXT,
"timestamp" INTEGER,
"year" INTEGER,
"month" INTEGER,
"day" INTEGER,
"hour" INTEGER,
"minute" INTEGER,
"second" INTEGER,
"microsecond" INTEGER,
"org_content_size" TEXT,
"req_address" TEXT,
"req_content_size" INTEGER,
"req_count" INTEGER,
"req_host" TEXT,
"req_method" TEXT,
"req_path" TEXT,
"req_port" TEXT,
"req_protocol" TEXT,
"req_scheme" TEXT,
"dge_application" TEXT,
"ogc_service" TEXT,
"ogc_version" TEXT,
"ogc_workspace" TEXT,
"ogc_layers" TEXT,
"ogc_request" TEXT,
"ogc_crs" TEXT,
"ogc_bbox" TEXT,
PRIMARY KEY("id" AUTOINCREMENT)
);
'''
sql_insert_logs = '''
INSERT INTO "main"."{table}"
("hash", "timestamp", "year", "month", "day", "hour", "minute", "microsecond", "org_content_size", "req_address", "req_content_size", "req_count", "req_host", "req_method", "req_path", "req_port", "req_protocol", "req_scheme", "dge_application", "ogc_service", "ogc_version", "ogc_workspace", "ogc_layers", "ogc_request", "ogc_crs", "ogc_bbox")
VALUES
(:hash, :timestamp, :year, :month, :day, :hour, :minute, :microsecond, :org_content_size, :req_address, :req_content_size, :req_count, :req_host, :req_method, :req_path, :req_port, :req_protocol, :req_scheme, :dge_application, :ogc_service, :ogc_version, :ogc_workspace, :ogc_layers, :ogc_request, :ogc_crs, :ogc_bbox);
'''
sql_create_table_layers = '''
CREATE TABLE IF NOT EXISTS "main"."{table}" (
"id" INTEGER NOT NULL UNIQUE,
"id_logs" INTEGER,
"service" TEXT,
"version" TEXT,
"workspace" TEXT,
"layer" TEXT,
"request" TEXT,
"crs" TEXT,
"bbox" TEXT,
PRIMARY KEY("id" AUTOINCREMENT)
);
'''
sql_insert_layers = '''
INSERT INTO "main"."{table}"
("id_logs", "service", "version", "workspace", "layer", "request", "crs", "bbox")
VALUES
(:id_logs, :service, :version, :workspace, :layer, :request, :crs, :bbox);
'''
sql_create_id_index = '''
CREATE UNIQUE INDEX "ix_logs_{tk_date}_id" ON "{table}" (
"id"
);
'''
sql_create_service_index = '''
CREATE INDEX "ix_logs_{tk_date}_service" ON "{table}" (
"ogc_service"
);
'''
sql_create_application_index = '''
CREATE INDEX "ix_logs_{tk_date}_application" ON "{table}" (
"dge_application"
);
'''
def convert_size(size_bytes):
if size_bytes == 0:
return "0B"
size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
i = int(math.floor(math.log(size_bytes, 1024)))
p = math.pow(1024, i)
s = round(size_bytes / p, 2)
return "%s %s" % (s, size_name[i])
def import_tk_file(db_file, tk_file, limit=False):
tk_file_size = os.path.getsize(tk_file)
print(tk_file, '-', convert_size(tk_file_size))
with sqlite3.connect(DB_FILE) as conn:
c = conn.cursor()
tk_date = os.path.basename(tk_file).replace('-', '_')
tables = [
'LOGS_' + tk_date,
'LAYERS_' + tk_date
]
# Drop tables
for table in tables:
c.execute(sql_drop_table.format(table=table))
# Create tables
c.execute(sql_create_table_logs.format(table=tables[0]))
c.execute(sql_create_table_layers.format(table=tables[1]))
c.execute(sql_create_service_index.format(tk_date=tk_date, table=tables[0]))
c.execute(sql_create_application_index.format(tk_date=tk_date, table=tables[0]))
conn.commit()
with open(tk_file, 'r', encoding="utf-8") as tkf:
num_line = 0
while (num_line < limit or not limit):
line = tkf.readline()
if not line:
break
else:
hash = hashlib.sha256(line.encode('utf-8')).hexdigest()
line_json = json.loads(line)
time = datetime.datetime.fromisoformat(line_json['StartUTC'])
dge_application = 'other'
for application in APPLICATION_FROM_PATH:
if line_json['RequestPath'].startswith(application):
dge_application = APPLICATION_FROM_PATH[application]
break
ogc_service = re.findall("[?|&]service=([a-z]*)[?|&]?", line_json['RequestPath'], re.IGNORECASE)
ogc_service = ogc_service[0].lower() if ogc_service else ''
ogc_request = re.findall("[?|&]request=([a-z]*)[?|&]?", line_json['RequestPath'], re.IGNORECASE)
ogc_request = ogc_request[0].lower() if len(ogc_request) else ''
ogc_version = re.findall("[?|&]version=([0-9.]*)[?|&]?", line_json['RequestPath'], re.IGNORECASE)
ogc_version = ogc_version[0].lower() if len(ogc_version) else ''
ogc_crs = re.findall("[?|&]crs=([a-z0-9.:]*)[?|&]?", line_json['RequestPath'], re.IGNORECASE)
ogc_crs = ogc_crs[0].lower() if ogc_crs else ''
ogc_bbox = re.findall("[?|&]bbox=([a-z0-9.:;,]*)[?|&]?", line_json['RequestPath'], re.IGNORECASE)
ogc_bbox = ogc_bbox[0] if ogc_bbox else ''
ogc_workspace = re.findall("/geoserver/([a-z0-9_.]*)/[a-z]", line_json['RequestPath'], re.IGNORECASE)
ogc_workspace = ogc_workspace[0].lower() if len(ogc_workspace) else ''
ogc_layers = re.findall("[?|&]layers=([a-z0-9_.:,-]*)[?|&|$]", line_json['RequestPath'], re.IGNORECASE)
ogc_layers = ogc_layers[0] if len(ogc_layers) else ''
layers = []
if ogc_layers:
for ogc_layer in ogc_layers.split(','):
layer_parts = ogc_layer.split(':')
if len(layer_parts) == 2:
layer = layer_parts[1]
workspace = layer_parts[0]
else:
layer = layer_parts[0]
workspace = ogc_workspace
layers.append({
'version': copy.deepcopy(ogc_version),
'service': copy.deepcopy(ogc_service),
'request': copy.deepcopy(ogc_request),
'workspace': copy.deepcopy(workspace),
'layer': copy.deepcopy(layer),
'crs': copy.deepcopy(ogc_crs),
'bbox': copy.deepcopy(ogc_bbox),
})
data = {
'hash': hash,
'timestamp': int(time.timestamp()),
'year': time.year,
'month': time.month,
'day': time.day,
'hour': time.hour,
'minute': time.minute,
'second': time.second,
'microsecond': time.microsecond,
'org_content_size': line_json['OriginContentSize'],
'req_address': line_json['RequestAddr'],
'req_content_size': line_json['RequestContentSize'],
'req_count': line_json['RequestCount'],
'req_host': line_json['RequestHost'],
'req_method': line_json['RequestMethod'],
'req_path': line_json['RequestPath'],
'req_port': line_json['RequestPort'],
'req_protocol': line_json['RequestProtocol'],
'req_scheme': line_json['RequestScheme'],
'dge_application': dge_application,
'ogc_service': ogc_service,
'ogc_version': ogc_version,
'ogc_request': ogc_request,
'ogc_workspace': ogc_workspace,
'ogc_layers': ogc_layers,
'ogc_crs': ogc_crs,
'ogc_bbox': ogc_bbox,
}
# Insert log line
c.execute(sql_insert_logs.format(table=tables[0]), data)
id_logs = c.lastrowid
# Insert layers of log
for layer in layers:
layer['id_logs'] = id_logs
c.execute(sql_insert_layers.format(table=tables[1]), layer)
num_line += 1
conn.commit()
print("Nb lines:", num_line)
now = datetime.datetime.now()
print("Duration:", str(now - start).split('.')[0])
if __name__ == "__main__":
print('=' * 80)
start = datetime.datetime.now()
print("Start:", start.strftime("%H:%M:%S"))
for tk_date in TK_DATES:
start_table = datetime.datetime.now()
print('-' * 80)
tk_file = DATA_DIRECTORY + tk_date
if os.path.isfile(tk_file):
import_tk_file(db_file=DB_FILE, tk_file=tk_file, limit=LIMIT)
else:
print(tk_file, 'does not exist.')
end_table = datetime.datetime.now()
print("Duration:", str(end_table - start_table).split('.')[0])
print('-' * 80)
end = datetime.datetime.now()
print("End:", end.strftime("%H:%M:%S"))
print("Total duration:", str(end - start).split('.')[0])
print('=' * 80)

sqlite_stats.py Normal file (+100 lines)

@@ -0,0 +1,100 @@
import sqlite3
import datetime
DB_FILE = './dge_traefik_logs.db3'
TABLES = ['LOGS_2023_07', 'LOGS_2023_08', 'LOGS_2023_09', 'LOGS_2023_10', 'LOGS_2023_11', 'LOGS_2023_12']
sql_create_id_index = '''
CREATE UNIQUE INDEX IF NOT EXISTS "ix_logs_{tk_date}_id" ON "{table}" (
"id"
);
'''
sql_create_service_index = '''
CREATE INDEX IF NOT EXISTS "ix_logs_{tk_date}_service" ON "{table}" (
"ogc_service"
);
'''
sql_create_application_index = '''
CREATE INDEX IF NOT EXISTS "ix_logs_{tk_date}_application" ON "{table}" (
"dge_application"
);
'''
sql_select_nb_lines = "SELECT COUNT(id) AS nb_lines FROM {table};"
sql_select_nb_requests = "SELECT COUNT(id) AS nb_requests FROM {table};"
sql_select_nb_requests_by_service = "SELECT ogc_service AS service, COUNT(id) AS requests FROM {table} WHERE dge_application IN ('geoserver', 'geonetwork') and ogc_service IN ('wms', 'wfs', 'wmts', 'csw') GROUP BY ogc_service ORDER BY ogc_service;"
sql_select_content_size = "SELECT SUM(org_content_size) AS size FROM {table};"
sql_select_content_size_by_service = "SELECT ogc_service AS service, SUM(org_content_size) AS size FROM {table} WHERE dge_application IN ('geoserver', 'geonetwork') and ogc_service IN ('wms', 'wfs', 'wmts', 'csw') GROUP BY ogc_service ORDER BY ogc_service;"
def read_tables(db_file):
result = {}
with sqlite3.connect(DB_FILE) as conn:
c = conn.cursor()
for table in TABLES:
print('Table:', table)
start_table = datetime.datetime.now()
tk_date = table.replace('LOGS_', '')
c.execute(sql_create_service_index.format(tk_date=tk_date, table=table))
c.execute(sql_create_application_index.format(tk_date=tk_date, table=table))
result[table] = {}
c.execute(sql_select_nb_lines.format(table=table))
result[table]['nb_lines'] = c.fetchone()[0]
c.execute(sql_select_nb_requests_by_service.format(table=table))
for row in c.fetchall():
result[table]['nb_' + row[0]] = row[1]
c.execute(sql_select_nb_requests.format(table=table))
result[table]['nb_all'] = c.fetchone()[0]
c.execute(sql_select_content_size_by_service.format(table=table))
for row in c.fetchall():
result[table]['size_' + row[0]] = row[1]
c.execute(sql_select_content_size.format(table=table))
result[table]['size_all'] = c.fetchone()[0]
end_table = datetime.datetime.now()
print("Duration:", str(end_table - start_table).split('.')[0])
return result
def print_result(result):
print('-'*80)
lines = []
first = True
for table in result:
if first:
line = ['TABLE']
line.extend([str(key).upper() for key in result[table]])
lines.append(','.join(line))
first = False
line = [table]
line.extend([str(result[table][key]) for key in result[table]])
lines.append(','.join(line))
print('\n'.join(lines))
print('-'*80)
if __name__ == "__main__":
print('=' * 80)
start = datetime.datetime.now()
print("Start:", start.strftime("%H:%M:%S"))
result = read_tables(DB_FILE)
print_result(result)
end = datetime.datetime.now()
print("End:", end.strftime("%H:%M:%S"))
print("Total duration:", str(end - start).split('.')[0])
print('=' * 80)

sqlite_to_parquet.py Normal file (+41 lines)

@@ -0,0 +1,41 @@
import os
import duckdb
duckdb.sql('''
ATTACH './db/dge_traefik_logs.db3' AS logs;
USE logs;
''')
r = duckdb.sql('''
SHOW TABLES;
''')
print(r)
tables = duckdb.sql('''
SHOW TABLES;
''').pl().get_column('name').to_list()
# print(tables[:10])
for table in tables:
print(table)
if not os.path.isfile(table + '.parquet'):
duckdb.sql(
f'''
COPY (SELECT * FROM logs.{table})
TO '{table}.parquet' (FORMAT 'parquet')
'''
)
nb = duckdb.sql(
f'''
FROM '{table}.parquet'
SELECT COUNT(*)
''')
print(nb)