73 lines
3.1 KiB
Python
73 lines
3.1 KiB
Python
import datetime
|
|
|
|
import duckdb
|
|
|
|
# Parquet file scanned by every query below (path is relative to the working directory).
pq_file = './parquet/logs.parquet'

# Each template takes a ``{table}`` placeholder that is filled with the parquet
# path via ``str.format`` before the query is run.

# Per (year, month): number of log lines and summed ``org_content_size``.
sql_select_by_month = (
    "SELECT year AS year, month AS month, COUNT(*) AS nb_lines, "
    "SUM(CAST(org_content_size as INTEGER)) AS size "
    "FROM read_parquet('{table}') "
    "GROUP BY year, month;"
)

# Per (year, month, ogc_service): number of requests, restricted to the
# geoserver/geonetwork applications and the wms/wfs/wmts/csw services.
sql_select_requests_by_service = (
    "SELECT year AS year, month, ogc_service AS service, COUNT(*) AS requests "
    "FROM read_parquet('{table}') "
    "WHERE dge_application IN ('geoserver', 'geonetwork') "
    "and ogc_service IN ('wms', 'wfs', 'wmts', 'csw') "
    "GROUP BY year, month, ogc_service "
    "ORDER BY year, month, ogc_service;"
)

# Same grouping and filter as above, but summing ``org_content_size``.
sql_select_size_by_service = (
    "SELECT year AS year, month, ogc_service AS service, "
    "SUM(CAST(org_content_size as INTEGER)) AS size "
    "FROM read_parquet('{table}') "
    "WHERE dge_application IN ('geoserver', 'geonetwork') "
    "and ogc_service IN ('wms', 'wfs', 'wmts', 'csw') "
    "GROUP BY year, month, ogc_service "
    "ORDER BY year, month, ogc_service;"
)
|
|
|
|
|
|
def _month_key(row):
    """Return the ``YYYY_MM`` bucket key built from a row's ``year`` and ``month``."""
    return '{}_{:02d}'.format(int(row['year']), int(row['month']))


def get_data(pq_file):
    """Aggregate monthly totals and per-service figures from a parquet log file.

    Runs the three module-level SQL templates against ``pq_file`` with DuckDB
    and merges their results into one dictionary per month.

    Args:
        pq_file: Path of the parquet file, substituted into each SQL template.

    Returns:
        A dict keyed by ``'YYYY_MM'``. Each value is a dict holding:
        ``nb_all`` / ``size_all`` (every log line of the month),
        ``nb_services`` / ``size_services`` (sum over the service rows),
        ``nb_<service>`` / ``size_<service>`` (one pair per service row
        returned for that month), and ``nb_other`` / ``size_other``
        (the month total minus the services total).
    """
    df_total = duckdb.sql(sql_select_by_month.format(table=pq_file)).df()
    df_requests_by_service = duckdb.sql(sql_select_requests_by_service.format(table=pq_file)).df()
    df_size_by_service = duckdb.sql(sql_select_size_by_service.format(table=pq_file)).df()

    result = {}

    for _, row in df_total.iterrows():
        result[_month_key(row)] = {
            'nb_all': row['nb_lines'],
            'size_all': row['size'],
        }

    for _, row in df_requests_by_service.iterrows():
        stats = result[_month_key(row)]
        # Start from 0 and always add the current row, so that the first
        # service seen for a month is counted in the running total too.
        stats['nb_services'] = stats.get('nb_services', 0) + row['requests']
        stats['nb_' + row['service']] = row['requests']

    for _, row in df_size_by_service.iterrows():
        stats = result[_month_key(row)]
        stats['size_services'] = stats.get('size_services', 0) + row['size']
        stats['size_' + row['service']] = row['size']

    for stats in result.values():
        # A month without any matching service row has no running totals yet;
        # treat it as zero service traffic instead of failing on a missing key.
        stats.setdefault('nb_services', 0)
        stats.setdefault('size_services', 0)
        stats['nb_other'] = stats['nb_all'] - stats['nb_services']
        stats['size_other'] = stats['size_all'] - stats['size_services']

    return result
|
|
|
|
|
|
def print_result(result):
    """Print the monthly statistics as CSV lines framed by two 80-dash rulers.

    The header is ``MONTH`` followed by the upper-cased statistic names; each
    following line is the month key followed by the values converted with
    ``int``.

    Args:
        result: Mapping of month key to a dict of numeric statistics, as
            returned by ``get_data``.
    """
    # Months do not necessarily share the same statistics (a service may have
    # no traffic in a given month), so the columns are the union of all keys
    # in first-seen order rather than the keys of the first month only.
    columns = list(dict.fromkeys(key for stats in result.values() for key in stats))

    lines = []
    if result:
        lines.append(','.join(['MONTH'] + [str(key).upper() for key in columns]))
    for month, stats in result.items():
        # A statistic absent for this month is printed as 0 to keep columns aligned.
        cells = [str(int(stats.get(key, 0))) for key in columns]
        lines.append(','.join([month] + cells))

    print('-' * 80)
    print('\n'.join(lines))
    print('-' * 80)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: compute the statistics for the default parquet file,
    # print them, and report the wall-clock start, end and duration of the run.
    banner = '=' * 80
    clock_format = "%H:%M:%S"

    print(banner)
    started_at = datetime.datetime.now()
    print("Start:", started_at.strftime(clock_format))

    monthly_stats = get_data(pq_file)
    print_result(monthly_stats)

    finished_at = datetime.datetime.now()
    print("End:", finished_at.strftime(clock_format))

    # Keep only the text before the first '.', which drops fractional seconds.
    elapsed = finished_at - started_at
    print("Total duration:", str(elapsed).partition('.')[0])
    print(banner)
|