first commit

2023-01-21 21:55:17 +01:00 · 2023-01-21 21:55:17 +01:00 · 730f4146ae
commit 730f4146ae
4 changed files with 418 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,169 @@
 # ---> Python
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .nox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *.cover
 *.py,cover
 .hypothesis/
 .pytest_cache/
 cover/
 # Translations
 *.mo
 *.pot
 # Django stuff:
 *.log
 local_settings.py
 db.sqlite3
 db.sqlite3-journal
 # Flask stuff:
 instance/
 .webassets-cache
 # Scrapy stuff:
 .scrapy
 # Sphinx documentation
 docs/_build/
 # PyBuilder
 .pybuilder/
 target/
 # Jupyter Notebook
 .ipynb_checkpoints
 # IPython
 profile_default/
 ipython_config.py
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
 # .python-version
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 #   install all needed dependencies.
 #Pipfile.lock
 # poetry
 #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 #   This is especially recommended for binary packages to ensure reproducibility, and is more
 #   commonly ignored for libraries.
 #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
 #poetry.lock
 # pdm
 #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
 #pdm.lock
 #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
 #   in version control.
 #   https://pdm.fming.dev/#use-with-ide
 .pdm.toml
 # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
 __pypackages__/
 # Celery stuff
 celerybeat-schedule
 celerybeat.pid
 # SageMath parsed files
 *.sage.py
 # Environments
 .env
 .venv
 env/
 venv/
 ENV/
 env.bak/
 venv.bak/
 # Spyder project settings
 .spyderproject
 .spyproject
 # Rope project settings
 .ropeproject
 # mkdocs documentation
 /site
 # mypy
 .mypy_cache/
 .dmypy.json
 dmypy.json
 # Pyre type checker
 .pyre/
 # pytype static type analyzer
 .pytype/
 # Cython debug symbols
 cython_debug/
 # PyCharm
 #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 # Data files
 *.csv
 *.xml
 # *.txt
 *.ipynb
 *.json
 _*.*
--- a/README.md
+++ b/README.md
@ -0,0 +1,11 @@
 # Totem2XML
 Script Python permettant de convertir une liste de fichiers de budget au format Totem XML, en fichiers CSV conformes au schéma national du SCDL pour les données budgétaires.
 Utilisation:
 ```
 > python run.py file1.xml [file2.xml file3.xml ... filen.xml]
 ```
 Les fichiers générés sont placés au même endroit que chaque fichier XML d'origine et portent le même nom, avec l'extension `.csv`.
 Ainsi, `./mon/fichier1.xml` sera converti en `./mon/fichier1.csv`.
--- a/requirements.txt
+++ b/requirements.txt
--- a/run.py
+++ b/run.py
@ -0,0 +1,238 @@
 #!/usr/bin/env python3
 '''Totem2XML
 Script Python permettant de convertir une liste de fichiers de budget au format Totem XML, en fichiers CSV conformes au schéma national du SCDL pour les données budgétaires.
 Utilisation:
 > python run.py file1.xml [file2.xml file3.xml ... filen.xml]
 Les fichiers générés sont placés au même endroit que chaque fichier XML d'origine et portent le même nom, avec l'extension ".csv".
 Ainsi, "./mon/fichier1.xml" sera converti en "./mon/fichier1.csv".
 '''
 import sys
 import os
 import json
 import csv
 import xmltodict
 import jmespath
 import requests
 # JMESPath expressions used to read the Totem budget file and the plan de compte.
 paths = {
    # Values read from the Totem budget document
    "budget_libellecoll": "DocumentBudgetaire.EnTeteDocBudgetaire.LibelleColl.V",
    "budget_nomenclature": "DocumentBudgetaire.Budget.EnTeteBudget.Nomenclature.V",
    "budget_idetab": "DocumentBudgetaire.Budget.EnTeteBudget.IdEtab.V",
    "budget_year": "DocumentBudgetaire.Budget.BlocBudget.Exer.V",
    "budget_natdec": "DocumentBudgetaire.Budget.BlocBudget.NatDec.V",
    "budget_lines": "DocumentBudgetaire.Budget.LigneBudget[*]",
    # Values read from the plan de compte
    "nomenclature_nature_chapitres": "Nomenclature[0].Nature[0].Chapitres[0].Chapitre[*]",
    "nomenclature_nature_comptes": "Nomenclature[0].Nature[0].Comptes[0].Compte[*]",
    "comptes": "Compte[*]",
    "nomenclature_fonction_chapitres": "Nomenclature[0].Fonction[0].Chapitres[0].Chapitre[*]",
    "nomenclature_fonction_references": "Nomenclature[0].Fonction[0].RefFonctionnelles[0].RefFonc[*]",
    "references_fonctionnelles": "RefFonc[*]",
 }
 # The tables below translate a code read from the XML into the label written
 # to the CSV. The "ERROR" (or "empty") key is the placeholder code used when
 # the value is absent from the XML.
 # NatDec code -> BGT_NATDEC label
 codes_natdec = {
    "01": "budget primitif",
    "02": "décision modificative",
    "03": "budget supplémentaire",
    "09": "compte administratif",
    "ERROR": "NetDec inconnu",
 }
 # Chapter "Section" code -> BGT_SECTION label
 codes_section = {
    "I": "investissement",
    "F": "fonctionnement",
    "ERROR": "ERROR",
 }
 # OpBudg code -> BGT_OPBUDG label
 codes_opbudg = {
    "0": "réel",
    "1": "ordre",
    "ERROR": "ERROR",
 }
 # CodRD code -> BGT_CODRD label
 codes_codrd = {
    "R": "recette",
    "D": "dépense",
    "ERROR": "ERROR",
 }
 # ArtSpe value -> BGT_ARTSPE label
 codes_artspe = {
    "false": "non spécialisé",
    "true": "spécialisé",
    "empty": "",
 }
 # Column names of the generated CSV file, in output order.
 csv_header = [
    "BGT_NATDEC",
    "BGT_ANNEE",
    "BGT_SIRET",
    "BGT_NOM",
    "BGT_CONTNAT",
    "BGT_CONTNAT_LABEL",
    "BGT_NATURE",
    "BGT_NATURE_LABEL",
    "BGT_FONCTION",
    "BGT_FONCTION_LABEL",
    "BGT_OPERATION",
    "BGT_SECTION",
    "BGT_OPBUDG",
    "BGT_CODRD",
    "BGT_MTREAL",
    "BGT_MTBUDGPREC",
    "BGT_MTRARPREC",
    "BGT_MTPROPNOUV",
    "BGT_MTPREV",
    "BGT_CREDOUV",
    "BGT_MTRAR3112",
    "BGT_ARTSPE",
 ]
 # Root URL completed with "/{year}/{n1}/{n2}/planDeCompte.xml" to download the plan de compte.
 pdc_root_url = "http://odm-budgetaire.org/composants/normes"
 def get_children(parents, children_name, children_path, results):
    """Flatten a tree of nested dictionaries into a single list.

    Each element of ``parents`` is searched with the JMESPath expression
    ``children_path``. When the search yields children, they are flattened
    first (recursively), then the ``children_name`` key is removed from the
    element and the element itself is appended. Descendants therefore always
    come before their ancestor in ``results``.

    Args:
        parents: iterable of dictionaries to flatten; ``None`` is treated as
            an empty iterable.
        children_name: key under which a dictionary stores its children.
        children_path: JMESPath expression returning the children of a
            dictionary.
        results: list receiving the flattened dictionaries (extended in place).

    Returns:
        The ``results`` list.

    Note:
        The dictionaries found in ``parents`` are modified in place: their
        ``children_name`` key is removed.
    """
    # jmespath.search() yields None when nothing matches, so a caller may hand
    # over None instead of a list; treat it as "nothing to flatten".
    for parent in parents or ():
        children = jmespath.search(children_path, parent)
        if children is not None:
            get_children(children, children_name, children_path, results)
            # pop() instead of del: tolerate a children_path that does not read
            # its result straight from the children_name key.
            parent.pop(children_name, None)
        results.append(parent)
    return results
 def _find_attribute(entries, code, attribute, default):
    """Return ``attribute`` of the first entry whose ``Code`` equals ``code``.

    Entries that match but do not carry ``attribute`` are skipped. ``default``
    is returned when nothing matches or when ``entries`` is empty or ``None``.
    """
    for entry in entries or ():
        if isinstance(entry, dict) and entry.get('Code') == code:
            value = entry.get(attribute)
            if value is not None:
                return value
    return default
 def line2csv(line, config):
    """Convert one budget line of the Totem XML into a CSV row.

    Args:
        line: dictionary describing one ``LigneBudget`` element; each field is
            read through its ``<Field>.V`` value.
        config: dictionary holding the nomenclature lists
            (``nomenclature_nature_chapitres``, ``nature_comptes``,
            ``fonction_references``) and the values shared by every row
            (``NatDec``, ``Exer``, ``IdEtab``, ``LibelleColl``).

    Returns:
        A dictionary with the ``BGT_*`` values of the row. Missing mandatory
        values and codes that cannot be translated are reported as ``'ERROR'``;
        missing optional values are reported as an empty string.
    """
    def field(name, default):
        # A missing or empty value falls back on the given default
        return jmespath.search(name + '.V', line) or default

    chapitres = config['nomenclature_nature_chapitres']
    ContNat = field('ContNat', 'ERROR')
    Nature = field('Nature', 'ERROR')
    Fonction = field('Fonction', '')
    # Labels are looked up directly in the nomenclature lists: building a
    # JMESPath filter from the code would fail on codes holding quotes or backticks.
    ContNat_Label = _find_attribute(chapitres, ContNat, 'Libelle', 'ERROR')
    Nature_Label = _find_attribute(config['nature_comptes'], Nature, 'Libelle', 'ERROR')
    Fonction_Label = _find_attribute(config['fonction_references'], Fonction, 'Libelle', '')
    # The section is carried by the chapter of the line, not by the line itself
    Section_Code = _find_attribute(chapitres, ContNat, 'Section', 'ERROR')
    # .get() keeps an unexpected code from aborting the whole conversion:
    # the row is written with 'ERROR' in the affected column instead.
    Section = codes_section.get(Section_Code, 'ERROR')
    OpBudg = codes_opbudg.get(field('OpBudg', 'ERROR'), 'ERROR')
    CodRD = codes_codrd.get(field('CodRD', 'ERROR'), 'ERROR')
    ArtSpe = codes_artspe.get(field('ArtSpe', 'empty'), 'ERROR')
    # BGT_NUMDEC is not produced (it is not part of csv_header either)
    return {
        'BGT_NATDEC': config['NatDec'],
        'BGT_ANNEE': config['Exer'],
        'BGT_SIRET': config['IdEtab'],
        'BGT_NOM': config['LibelleColl'],
        'BGT_CONTNAT': ContNat,
        'BGT_CONTNAT_LABEL': ContNat_Label,
        'BGT_NATURE': Nature,
        'BGT_NATURE_LABEL': Nature_Label,
        'BGT_FONCTION': Fonction,
        'BGT_FONCTION_LABEL': Fonction_Label,
        'BGT_OPERATION': field('Operation', ''),
        'BGT_SECTION': Section,
        'BGT_OPBUDG': OpBudg,
        'BGT_CODRD': CodRD,
        'BGT_MTREAL': field('MtReal', ''),
        'BGT_MTBUDGPREC': field('MtBudgPrec', ''),
        'BGT_MTRARPREC': field('MtRARPrec', ''),
        'BGT_MTPROPNOUV': field('MtPropNouv', ''),
        'BGT_MTPREV': field('MtPrev', ''),
        'BGT_CREDOUV': field('CredOuv', ''),
        'BGT_MTRAR3112': field('MtRAR3112', ''),
        'BGT_ARTSPE': ArtSpe,
    }
 def totem2csv(xml_file):
    """Convert a Totem XML budget file to a CSV file following the SCDL schema.

    The CSV file is written next to ``xml_file``, with the same name and the
    ``.csv`` extension. The plan de compte matching the year and nomenclature
    of the budget is downloaded from ``pdc_root_url`` to label the rows.

    Args:
        xml_file: path of the Totem XML file to convert.

    Raises:
        ValueError: the year or the nomenclature cannot be read from the file.
        requests.RequestException: the plan de compte cannot be downloaded.
    """
    csv_file = os.path.splitext(xml_file)[0] + '.csv'
    # Read XML file as bytes: the parser then applies the encoding declared by
    # the document itself instead of the locale default of the machine
    with open(xml_file, 'rb') as xf:
        xml = xf.read()
    # LigneBudget is always parsed as a list, so that a budget holding a single
    # line has the same shape as any other
    xml_dict = xmltodict.parse(xml, attr_prefix='', force_list=('LigneBudget',))
    # Define plan de compte
    year = jmespath.search(paths['budget_year'], xml_dict)
    nomenclature = jmespath.search(paths['budget_nomenclature'], xml_dict)
    if not year or not isinstance(nomenclature, str) or '-' not in nomenclature:
        raise ValueError('Exercice ou nomenclature introuvable dans le fichier {xml_file}'.format(xml_file=xml_file))
    n1, n2 = nomenclature.split('-', 1)
    pdc_url = pdc_root_url + '/{year}/{n1}/{n2}/planDeCompte.xml'.format(year=year, n1=n1, n2=n2)
    print("Plan de compte: {pdc_url}".format(pdc_url=pdc_url))
    # Get XML plan de compte from pdc_url; the timeout keeps an unresponsive
    # server from blocking forever and an HTTP error page is never parsed
    pdc_response = requests.get(pdc_url, timeout=30)
    pdc_response.raise_for_status()
    pdc_dict = xmltodict.parse(pdc_response.content, attr_prefix='', force_list=True)
    # Read plan de compte: get "nature chapitres", "nature comptes" and "fonction references"
    nomenclature_nature_chapitres = jmespath.search(paths['nomenclature_nature_chapitres'], pdc_dict)
    nomenclature_nature_comptes = jmespath.search(paths['nomenclature_nature_comptes'], pdc_dict)
    nomenclature_fonction_references = jmespath.search(paths['nomenclature_fonction_references'], pdc_dict)
    # Get flat "nature comptes" and "fonction references" lists; a section
    # missing from the plan de compte is handled as an empty list
    nature_comptes = get_children(nomenclature_nature_comptes or [], 'Compte', paths['comptes'], [])
    fonction_references = get_children(nomenclature_fonction_references or [], 'RefFonc', paths['references_fonctionnelles'], [])
    # Get main informations from XML header
    NatDec_Code = jmespath.search(paths['budget_natdec'], xml_dict) or 'ERROR'
    config = {
        'nomenclature_nature_chapitres': nomenclature_nature_chapitres or [],
        'nature_comptes': nature_comptes,
        'fonction_references': fonction_references,
        # An unexpected code gets the same label as a missing one
        'NatDec': codes_natdec.get(NatDec_Code, codes_natdec['ERROR']),
        'Exer': year,
        'IdEtab': jmespath.search(paths['budget_idetab'], xml_dict) or 'ERROR',
        'LibelleColl': jmespath.search(paths['budget_libellecoll'], xml_dict) or 'ERROR',
    }
    # Get budget lines (none at all when the budget has no LigneBudget)
    budget_lines = jmespath.search(paths['budget_lines'], xml_dict) or []
    nb_xml_lines = len(budget_lines)
    print("{nb_xml_lines} lignes de budget dans le fichier {xml_file}.".format(nb_xml_lines=nb_xml_lines, xml_file=xml_file))
    # Open CSV file to save lines
    nb_csv_lines = 0
    with open(csv_file, 'w', newline='', encoding="utf-8") as cf:
        csv_writer = csv.DictWriter(cf, fieldnames=csv_header, delimiter=',', quoting=csv.QUOTE_NONNUMERIC)
        csv_writer.writeheader()
        # Generate and save CSV line
        # Cf. process details in https://gitlab.com/datafin/totem/-/blob/master/totem2csv/xsl/totem2xmlcsv.xsl
        for id_line, line in enumerate(budget_lines, start=1):
            print('Traitement et enregistrement de la ligne {id_line}/{nb_xml_lines}'.format(id_line=id_line, nb_xml_lines=nb_xml_lines))
            csv_writer.writerow(line2csv(line, config))
            nb_csv_lines += 1
    print("{nb_csv_lines} lignes enregistrées dans le fichier {csv_file} sur {nb_xml_lines}.".format(nb_csv_lines=nb_csv_lines, csv_file=csv_file, nb_xml_lines=nb_xml_lines))
 def main(argv):
    """Convert every Totem XML file listed on the command line.

    Args:
        argv: list of paths of the XML files to convert (program name excluded).

    The files are converted in order. The program stops with exit status 1,
    after printing a message, when no file is given or as soon as an argument
    is not an existing file whose name ends with ``.xml``.
    """
    if not argv:
        print('No XML file in command line')
        # Non-zero status so that calling scripts can detect the failure
        sys.exit(1)
    for xml_file in argv:
        if not (os.path.isfile(xml_file) and xml_file.endswith('.xml')):
            print('File {xml_file} not exist or is not XML file'.format(xml_file=xml_file))
            sys.exit(1)
        totem2csv(xml_file)
 # Run the conversion only when this file is executed as a script; the
 # command-line arguments are passed without the program name.
 if __name__ == "__main__":
   main(sys.argv[1:])