From 730f4146aeb8d25c48c14413919d4395c6fc3b57 Mon Sep 17 00:00:00 2001 From: Guillaume RYCKELYNCK Date: Sat, 21 Jan 2023 21:55:17 +0100 Subject: [PATCH] first commit --- .gitignore | 169 +++++++++++++++++++++++++++++++++ README.md | 11 +++ requirements.txt | Bin 0 -> 1460 bytes run.py | 238 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 418 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 requirements.txt create mode 100644 run.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7b9e41e --- /dev/null +++ b/.gitignore @@ -0,0 +1,169 @@ +# ---> Python +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# Data files +*.csv +*.xml +# *.txt +*.ipynb +*.json +_*.* \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..f0bf45b --- /dev/null +++ b/README.md @@ -0,0 +1,11 @@ +# Totem2XML + +Script Python permettant de convertir une liste de fichiers de budget au format Totem XML, en fichiers CSV conformes au schéma national du SCDL pour les données budgétaires. + +Utilisation: +``` +> python run.py file1.xml [file2.xml file3.xml ... filen.xml] +``` + +Chaque fichier généré est placé au même endroit que le fichier XML d'origine et porte le même nom. +Ainsi, `./mon/fichier1.xml` sera converti en `./mon/fichier1.csv`. \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..0578d78df7a4b726bd438367d9009c147d37fc90 GIT binary patch literal 1460 zcmZXU-E!Jc41{&vnLY{~Z0D!B$UF26I>vT_2^hz?O~}Ky-A_6Plj$%HLnN(sSCaq! 
zsqJL5K1X}8&IbK@)Tgv>{uK7rzSzcUYZPzvz1N3~b;?+KMx)wED7`)D352x}z$yIi zeahPkV^Wt+*(>!s>es<>d;4${sw>seA^JDU+Iwd0xw-$1GVT)YQIW%iS)oYjkye-+ z`(@8I+S%j8?)2PV;^0r>fD4o7+$v69^v9Q3k+U$4(#Nmx363|8)7iT^HR^;5gZjZT zc%TIzdutV`J%Ylh?423zPl|TQUl^9j<*0@KY+ErH9O1h%cmjK;$~Ji)&Jha_c5`^r@~!|jk?~qmDr=D z`_vZ|%a%p{N79$CA3w6|XM1zME9u9DO1qoAI_vrm_L{0ugzLcsHLvO%(_h$@SKUuS ze)sCQTilImHYVnsRm25?zsw+RpX+SpKJ2bzyd9VZGtajVN7RQpJm=lm>upnAc$AKU htIYXLdxHb5Fyac~5W7G;ZUrXnN~0`zkW*ak=0B`2)Vcrw literal 0 HcmV?d00001 diff --git a/run.py b/run.py new file mode 100644 index 0000000..1259a35 --- /dev/null +++ b/run.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python3 + +'''Totem2XML +Script Python permettant de convertir une liste de fichiers de budget au format Totem XML, en fichiers CSV conformes au schéma national du SCDL pour les données budgétaires. + +Utilisation: +> python run.py file1.xml [file2.xml file2.xml ... filen.xml] + +Les fichiers générés sont localisés au même endroit que chaque fichier XML et aura le même nom. +Ainsi, "./mon/fichier1.xml" sera converti en "./mon/fichier1.csv". 
import csv
import json
import os
import sys

import jmespath
import requests
import xmltodict


# JMESPath expressions locating the data of interest in the parsed XML trees.
# The "budget_*" paths apply to the Totem budget file, the others to the
# "plan de compte" (chart of accounts) downloaded from pdc_root_url.
paths = {
    'budget_libellecoll': 'DocumentBudgetaire.EnTeteDocBudgetaire.LibelleColl.V',
    'budget_nomenclature': 'DocumentBudgetaire.Budget.EnTeteBudget.Nomenclature.V',
    'budget_idetab': 'DocumentBudgetaire.Budget.EnTeteBudget.IdEtab.V',
    'budget_year': 'DocumentBudgetaire.Budget.BlocBudget.Exer.V',
    'budget_natdec': 'DocumentBudgetaire.Budget.BlocBudget.NatDec.V',
    'budget_lines': 'DocumentBudgetaire.Budget.LigneBudget[*]',

    'nomenclature_nature_chapitres': 'Nomenclature[0].Nature[0].Chapitres[0].Chapitre[*]',
    'nomenclature_nature_comptes': 'Nomenclature[0].Nature[0].Comptes[0].Compte[*]',
    'comptes': 'Compte[*]',

    'nomenclature_fonction_chapitres': 'Nomenclature[0].Fonction[0].Chapitres[0].Chapitre[*]',
    'nomenclature_fonction_references': 'Nomenclature[0].Fonction[0].RefFonctionnelles[0].RefFonc[*]',
    'references_fonctionnelles': 'RefFonc[*]',
}

# Translation tables from the raw codes found in the XML to the labels written
# to the CSV. Each table carries its own fallback entry ('ERROR' or 'empty').
codes_natdec = {
    '01': 'budget primitif',
    '02': 'décision modificative',
    '03': 'budget supplémentaire',
    '09': 'compte administratif',
    # NOTE(review): "NetDec" looks like a typo for "NatDec"; kept unchanged
    # because this label is written to the CSV output.
    'ERROR': 'NetDec inconnu'
}

codes_section = {
    'I': 'investissement',
    'F': 'fonctionnement',
    'ERROR': 'ERROR'
}

codes_opbudg = {
    '0': 'réel',
    '1': 'ordre',
    'ERROR': 'ERROR'
}

codes_codrd = {
    'R': 'recette',
    'D': 'dépense',
    'ERROR': 'ERROR'
}

codes_artspe = {
    'false': 'non spécialisé',
    'true': 'spécialisé',
    'empty': ''
}

# Column order of the generated CSV file.
csv_header = [
    "BGT_NATDEC", "BGT_ANNEE", "BGT_SIRET", "BGT_NOM",
    "BGT_CONTNAT", "BGT_CONTNAT_LABEL",
    "BGT_NATURE", "BGT_NATURE_LABEL",
    "BGT_FONCTION", "BGT_FONCTION_LABEL",
    "BGT_OPERATION", "BGT_SECTION", "BGT_OPBUDG", "BGT_CODRD",
    "BGT_MTREAL", "BGT_MTBUDGPREC", "BGT_MTRARPREC", "BGT_MTPROPNOUV",
    "BGT_MTPREV", "BGT_CREDOUV", "BGT_MTRAR3112", "BGT_ARTSPE",
]

pdc_root_url = "http://odm-budgetaire.org/composants/normes"

# Seconds to wait for the "plan de compte" server before giving up, so that a
# stalled connection cannot hang the conversion forever.
PDC_REQUEST_TIMEOUT = 30


def get_children(parents, children_name, children_path, results=None):
    """Flatten a tree of nested entries into a single list.

    Each entry of ``parents`` may hold nested entries of the same kind under
    the key ``children_name``. Descendants are appended to ``results`` before
    their parent, and the nested key is removed from the parent (the input
    dictionaries are modified in place).

    Args:
        parents: iterable of entries (dictionaries); ``None`` is treated as
            an empty list.
        children_name: key under which an entry stores its nested entries.
        children_path: JMESPath expression selecting the nested entries.
        results: list receiving the flattened entries; a new list is created
            when omitted.

    Returns:
        The ``results`` list.
    """
    if results is None:
        results = []

    for parent in parents or []:
        children = jmespath.search(children_path, parent)
        if children is not None:
            get_children(children, children_name, children_path, results)
            # pop() rather than del: the path and the key are independent
            # arguments and might not designate the same entry.
            parent.pop(children_name, None)
        results.append(parent)
    return results


def _index_by_code(entries, field):
    """Map the ``Code`` of each entry to its ``field`` value (first match wins)."""
    index = {}
    for entry in entries or []:
        if not isinstance(entry, dict):
            continue
        code = entry.get('Code')
        value = entry.get(field)
        if isinstance(code, str) and value is not None:
            index.setdefault(code, value)
    return index


def _build_indexes(config):
    """Build the code -> label/section lookup tables used by line2csv."""
    chapitres = config.get('nomenclature_nature_chapitres')
    return {
        'contnat_label': _index_by_code(chapitres, 'Libelle'),
        'contnat_section': _index_by_code(chapitres, 'Section'),
        'nature_label': _index_by_code(config.get('nature_comptes'), 'Libelle'),
        'fonction_label': _index_by_code(config.get('fonction_references'), 'Libelle'),
    }


def _decode(table, code, fallback_key='ERROR'):
    """Translate ``code`` with ``table``, using the table's fallback entry for unknown codes."""
    if isinstance(code, str) and code in table:
        return table[code]
    return table[fallback_key]


def _value(line, field, default=''):
    """Return the ``V`` attribute of ``field`` in a budget line, or ``default`` when absent or empty."""
    return jmespath.search(field + '.V', line) or default


def line2csv(line, config):
    """Convert one budget line to a CSV row.

    Args:
        line: dictionary of one ``LigneBudget`` element as parsed by
            xmltodict.
        config: dictionary holding the header values (``NatDec``, ``Exer``,
            ``IdEtab``, ``LibelleColl``) and the plan de compte lists
            (``nomenclature_nature_chapitres``, ``nature_comptes``,
            ``fonction_references``). When it also holds pre-built lookup
            tables under ``_indexes`` they are used as is; otherwise they are
            rebuilt from the lists on every call.

    Returns:
        A dictionary whose keys are the column names of ``csv_header``.
        Mandatory values that are missing or unknown are reported as
        ``'ERROR'``; optional ones are left empty.
    """
    # Plain dictionary lookups replace per-line JMESPath filters built with
    # str.format, which broke on codes containing quotes or backticks.
    indexes = config.get('_indexes') or _build_indexes(config)

    cont_nat = _value(line, 'ContNat', 'ERROR')
    nature = _value(line, 'Nature', 'ERROR')
    fonction = _value(line, 'Fonction')
    section_code = indexes['contnat_section'].get(cont_nat)

    return {
        'BGT_NATDEC': config['NatDec'],
        'BGT_ANNEE': config['Exer'],
        'BGT_SIRET': config['IdEtab'],
        'BGT_NOM': config['LibelleColl'],
        'BGT_CONTNAT': cont_nat,
        'BGT_CONTNAT_LABEL': indexes['contnat_label'].get(cont_nat, 'ERROR'),
        'BGT_NATURE': nature,
        'BGT_NATURE_LABEL': indexes['nature_label'].get(nature, 'ERROR'),
        'BGT_FONCTION': fonction,
        'BGT_FONCTION_LABEL': indexes['fonction_label'].get(fonction, ''),
        'BGT_OPERATION': _value(line, 'Operation'),
        'BGT_SECTION': _decode(codes_section, section_code),
        'BGT_OPBUDG': _decode(codes_opbudg, _value(line, 'OpBudg', 'ERROR')),
        'BGT_CODRD': _decode(codes_codrd, _value(line, 'CodRD', 'ERROR')),
        'BGT_MTREAL': _value(line, 'MtReal'),
        'BGT_MTBUDGPREC': _value(line, 'MtBudgPrec'),
        'BGT_MTRARPREC': _value(line, 'MtRARPrec'),
        'BGT_MTPROPNOUV': _value(line, 'MtPropNouv'),
        'BGT_MTPREV': _value(line, 'MtPrev'),
        'BGT_CREDOUV': _value(line, 'CredOuv'),
        'BGT_MTRAR3112': _value(line, 'MtRAR3112'),
        'BGT_ARTSPE': _decode(codes_artspe, _value(line, 'ArtSpe', 'empty'), 'empty'),
    }


def totem2csv(xml_file):
    """Convert a Totem XML file to a CSV file following the SCDL schema.

    The CSV file is written next to ``xml_file``, with the same base name and
    the ``.csv`` extension. The plan de compte matching the budget's year and
    nomenclature is downloaded from ``pdc_root_url``.

    Args:
        xml_file: path of the Totem XML file.

    Raises:
        ValueError: the budget does not declare a year or a nomenclature of
            the form ``<n1>-<n2>``.
        requests.RequestException: the plan de compte cannot be downloaded.
        OSError: the XML file cannot be read or the CSV file written.
    """
    csv_file = os.path.splitext(xml_file)[0] + '.csv'

    # Read the file as bytes so the parser honours the encoding declared in
    # the XML prolog instead of the platform's default text encoding.
    # 'LigneBudget' is forced to a list so that a budget holding a single
    # line is handled like any other.
    with open(xml_file, 'rb') as xf:
        xml_dict = xmltodict.parse(xf.read(), attr_prefix='', force_list=('LigneBudget',))

    # Define plan de compte
    year = jmespath.search(paths['budget_year'], xml_dict)
    nomenclature = jmespath.search(paths['budget_nomenclature'], xml_dict)
    if not year or not isinstance(nomenclature, str) or '-' not in nomenclature:
        raise ValueError(
            'Cannot locate the plan de compte of {xml_file}: year={year!r}, nomenclature={nomenclature!r}'.format(
                xml_file=xml_file, year=year, nomenclature=nomenclature))
    n1, n2 = nomenclature.split('-', 1)
    pdc_url = pdc_root_url + '/{year}/{n1}/{n2}/planDeCompte.xml'.format(year=year, n1=n1, n2=n2)
    print("Plan de compte: {pdc_url}".format(pdc_url=pdc_url))

    # Get XML plan de compte from pdc_url; fail on HTTP errors rather than
    # trying to parse an error page.
    pdc_response = requests.get(pdc_url, timeout=PDC_REQUEST_TIMEOUT)
    pdc_response.raise_for_status()
    pdc_dict = xmltodict.parse(pdc_response.content, attr_prefix='', force_list=True)

    # Read plan de compte: "nature chapitres", "nature comptes" and
    # "fonction references"; the last two are trees that get flattened.
    nomenclature_nature_chapitres = jmespath.search(paths['nomenclature_nature_chapitres'], pdc_dict)
    nature_comptes = get_children(
        jmespath.search(paths['nomenclature_nature_comptes'], pdc_dict),
        'Compte', paths['comptes'], [])
    fonction_references = get_children(
        jmespath.search(paths['nomenclature_fonction_references'], pdc_dict),
        'RefFonc', paths['references_fonctionnelles'], [])

    # Get main informations from XML header
    config = {
        'nomenclature_nature_chapitres': nomenclature_nature_chapitres,
        'nature_comptes': nature_comptes,
        'fonction_references': fonction_references,
        'NatDec': _decode(codes_natdec, jmespath.search(paths['budget_natdec'], xml_dict)),
        'Exer': year,
        'IdEtab': jmespath.search(paths['budget_idetab'], xml_dict) or 'ERROR',
        'LibelleColl': jmespath.search(paths['budget_libellecoll'], xml_dict) or 'ERROR',
    }
    # Build the lookup tables once instead of once per budget line.
    config['_indexes'] = _build_indexes(config)

    # Get budget lines (a budget without any line yields an empty CSV body)
    budget_lines = jmespath.search(paths['budget_lines'], xml_dict) or []
    nb_xml_lines = len(budget_lines)
    print("{nb_xml_lines} lignes de budget dans le fichier {xml_file}.".format(nb_xml_lines=nb_xml_lines, xml_file=xml_file))

    # Open CSV file to save lines
    with open(csv_file, 'w', newline='', encoding="utf-8") as cf:
        csv_writer = csv.DictWriter(cf, fieldnames=csv_header, delimiter=',', quoting=csv.QUOTE_NONNUMERIC)
        csv_writer.writeheader()
        nb_csv_lines = 0

        # Generate and save CSV line
        # Cf. process details in https://gitlab.com/datafin/totem/-/blob/master/totem2csv/xsl/totem2xmlcsv.xsl
        for id_line, line in enumerate(budget_lines, start=1):
            print('Traitement et enregistrement de la ligne {id_line}/{nb_xml_lines}'.format(id_line=id_line, nb_xml_lines=nb_xml_lines))
            csv_writer.writerow(line2csv(line, config))
            nb_csv_lines += 1

    print("{nb_csv_lines} lignes enregistrées dans le fichier {csv_file} sur {nb_xml_lines}.".format(nb_csv_lines=nb_csv_lines, csv_file=csv_file, nb_xml_lines=nb_xml_lines))


def main(argv):
    """Convert every Totem XML file named in ``argv``.

    Files are processed in order; the program stops with a non-zero exit
    status (message on stderr) when no file is given or as soon as an
    argument is not an existing ``.xml`` file.

    Args:
        argv: list of command-line arguments, without the program name.
    """
    if not argv:
        sys.exit('No XML file in command line')

    for xml_file in argv:
        # Extension check is case-insensitive so "BUDGET.XML" is accepted.
        is_xml = os.path.splitext(xml_file)[1].lower() == '.xml'
        if not os.path.isfile(xml_file) or not is_xml:
            sys.exit('File {xml_file} not exist or is not XML file'.format(xml_file=xml_file))

        totem2csv(xml_file)


if __name__ == "__main__":
    main(sys.argv[1:])