version 20230312

2023-03-12 09:03:01 +01:00 · 2023-03-12 09:03:01 +01:00 · 32d682383b
commit 32d682383b
parent 1dcba6a0bf
4 changed files with 209 additions and 28 deletions
--- a/README.md
+++ b/README.md
@ -3,8 +3,17 @@
 Script Python permettant de convertir une liste de fichiers de budget au format Totem XML, en fichiers CSV conformes au schéma national du SCDL pour les données budgétaires.

 Utilisation:
+
+Récupération des plans de comptes:
+```
+> python get_pdc.py
+```
+
+Conversion de fichiers XML Totem en CSV:
 ```
 > python run.py file1.xml [file2.xml file2.xml ... filen.xml]
+> # ou
+> python run.py directory
 ```

 Les fichiers générés sont localisés au même endroit que chaque fichier XML et auront le même nom.
@ -16,7 +25,14 @@ Ce script s'appuie sur les plans de compte diffusés sur http://odm-budgetaire.o

 Il n'utilise pas de XSL pour convertir les fichiers, mais convertit le XML en dictionnaire Python pour traiter les informations.

-Il réalise la même opération que l'outil "DataClic" proposé par OpenDataFrance: https://dataclic.fr/budget. Les calculs sont basés sur le script https://gitlab.com/datafin/totem/-/blob/master/totem2csv/xsl/totem2xmlcsv.xsl. Il se limite cependant à la conversion des lignes de budget et ne réalise pars l'anonymisation du fichier Totem XML.
+Il s'inspire fortement de l'outil "DataClic" proposé par OpenDataFrance: https://dataclic.fr/budget. Les calculs de valeurs sont basés sur le script https://gitlab.com/datafin/totem/-/blob/master/totem2csv/xsl/totem2xmlcsv.xsl. Il se limite cependant à la conversion des lignes de budget et ne réalise pas l'anonymisation du fichier Totem XML.
+
+Un problème a cependant été identifié sur l'outil DataClic pour l'identification des labels de nature et de fonction à partir des codes correspondants.
+Il semble que certaines lignes du budget dépendent de la nomenclature de l'année précédente. Ainsi, en utilisant le plan de compte d'une seule année, certains labels ne peuvent pas être renseignés.
+Pour contourner ce problème, le script de conversion utilise un fichier de référence pour chaque type de plan de compte, constitué par la capitalisation pluriannuelle des nomenclatures (période 2015/2023). Ces fichiers sont disponibles dans le dossier "pdc" et peuvent être générés/mis à jour via le script get_pdc.py.
+La génération de ces nomenclatures spécifiques s'appuie sur les hypothèses suivantes:
+- Un code correspond toujours au même libellé dans un type de plan de compte (ex.: M14, M71, M57, etc.). Il peut être supprimé d'une nomenclature mais n'est pas réutilisé pour un autre libellé.
+- Il n'y a pas de doublon de codes et libellés entre les sous-types de plans de compte (ex.: entre M57, M57_A, M57_D et M57_P)

 ## Modules utilisés

@ -42,7 +58,12 @@ $ .\venv\Scripts\activate  # 'source ./venv/bin/activate' sur linux
 $ python -m pip install -r requirements.txt
 ```

-Lancer le script.
+Le script "get_pdc.py" permet de mettre à jour la liste des chapitres et comptes de chaque plan de compte en les téléchargeant et les enregistrant au format JSON dans le dossier "pdc".
+```
+> python get_pdc.py
+```
+
+Le script "run.py" permet de convertir un ou plusieurs fichiers XML Totem en CSV.
 ```
 $ python run.py monfichier.xml
 ```
--- a/get_pdc.py
+++ b/get_pdc.py
@ -0,0 +1,133 @@
+import sys
+import os
+import json
+import csv
+import xmltodict
+import jmespath
+import requests
+import pprint
+
+# Years for which a plan de compte is requested (used as the {year} URL segment).
+pdc_years = [str(year) for year in range(2015, 2024)]
+
+# Plan de compte family -> sub-type codes; both are used as URL path segments.
+pdc_codes = dict(
+    M14=[
+        'M14_CCAS_INF3500',
+        'M14_CCAS_SUP3500',
+        'M14_CE',
+        'M14_COM_500_3500',
+        'M14_COM_INF500',
+        'M14_COM_SUP3500',
+    ],
+    M4=['M4', 'M41', 'M43', 'M49_A', 'M49_D'],
+    M52=['M52'],
+    M57=['M57', 'M57_A', 'M57_D', 'M57_P'],
+    M61=['M61'],
+    M71=['M71'],
+)
+
+# Base URL under which the planDeCompte.xml files are requested.
+pdc_root_url = 'http://odm-budgetaire.org/composants/normes'
+
+
+def get_pdc_urls(pdc):
+    """Build the planDeCompte.xml URL of every (year, sub-type) pair of a family.
+
+    `pdc` must be a key of `pdc_codes`. The URLs are ordered by year first,
+    then by sub-type code; no request is made here.
+    """
+    print('Get URL of plan de compte {}.'.format(pdc))
+    suffix = '/{year}/{c1}/{c2}/planDeCompte.xml'
+    return [
+        pdc_root_url + suffix.format(year=year, c1=pdc, c2=code)
+        for year in pdc_years
+        for code in pdc_codes[pdc]
+    ]
+
+
+def get_children(parents, children_name, children_path, results):
+    """Flatten a tree of dictionaries into `results` and return that list.
+
+    For each parent, `children_path` (a JMESPath expression) is searched.
+    When it matches, the children are flattened first, then the
+    `children_name` key is removed from the parent (the input is modified
+    in place) and the parent itself is appended.
+    """
+    for node in parents:
+        nested = jmespath.search(children_path, node)
+        if nested is not None:
+            # Depth-first: descendants are appended before the node itself.
+            get_children(nested, children_name, children_path, results)
+            del node[children_name]
+        results.append(node)
+    return results
+
+
+def get_nature_chapitres(pdc_dict):
+    """Return the nature "Chapitre" entries of a parsed plan de compte ([] if none)."""
+    found = jmespath.search('Nomenclature[0].Nature[0].Chapitres[0].Chapitre[*]', pdc_dict)
+    return found if found else []
+
+
+def get_nature_comptes(pdc_dict):
+    """Return every nature "Compte" entry, nested ones included, as a flat list."""
+    top_level = jmespath.search('Nomenclature[0].Nature[0].Comptes[0].Compte[*]', pdc_dict)
+    if not top_level:
+        top_level = []
+    # Nested "Compte" entries are pulled up by get_children.
+    return get_children(top_level, 'Compte', 'Compte[*]', [])
+
+
+def get_fonction_chapitres(pdc_dict):
+    """Return the fonction "Chapitre" entries of a parsed plan de compte ([] if none)."""
+    found = jmespath.search('Nomenclature[0].Fonction[0].Chapitres[0].Chapitre[*]', pdc_dict)
+    return found if found else []
+
+
+def get_fonction_comptes(pdc_dict):
+    """Return every fonction "Compte" entry, nested ones included, as a flat list."""
+    top_level = jmespath.search('Nomenclature[0].Fonction[0].Comptes[0].Compte[*]', pdc_dict)
+    if not top_level:
+        top_level = []
+    # Nested "Compte" entries are pulled up by get_children.
+    return get_children(top_level, 'Compte', 'Compte[*]', [])
+
+
+def get_fonction_references(pdc_dict):
+    """Return every "RefFonc" entry, nested ones included, as a flat list."""
+    top_level = jmespath.search('Nomenclature[0].Fonction[0].RefFonctionnelles[0].RefFonc[*]', pdc_dict)
+    if not top_level:
+        top_level = []
+    # Nested "RefFonc" entries are pulled up by get_children.
+    return get_children(top_level, 'RefFonc', 'RefFonc[*]', [])
+
+
+def clean_dict(pdc_dict):
+    """Drop entries whose 'Code' was already seen, keeping the first occurrence.
+
+    Args:
+        pdc_dict: iterable of dictionaries (despite the name, not a dict),
+            each holding a 'Code' key.
+
+    Returns:
+        A new list with the first entry found for each code, in input order.
+
+    Raises:
+        KeyError: if an entry has no 'Code' key.
+    """
+    # A set gives O(1) membership tests; scanning a growing list made the
+    # de-duplication quadratic on the merged multi-year lists.
+    # NOTE(review): assumes 'Code' values are hashable (strings) - confirm.
+    seen_codes = set()
+    result = []
+    for entry in pdc_dict:
+        code = entry['Code']
+        if code not in seen_codes:
+            seen_codes.add(code)
+            result.append(entry)
+    return result
+
+
+def save_pdc(filename, pdc_dict):
+    """De-duplicate plan de compte entries by code and write them as JSON.
+
+    Args:
+        filename: path of the JSON file to write; its parent folder is
+            created when missing.
+        pdc_dict: list of entries, passed through clean_dict() before saving.
+    """
+    pdc_dict = clean_dict(pdc_dict)
+    # Create the target folder on a fresh checkout instead of failing with
+    # FileNotFoundError.
+    directory = os.path.dirname(filename)
+    if directory:
+        os.makedirs(directory, exist_ok=True)
+    # Explicit encoding so the output does not depend on the platform default.
+    with open(filename, "w", encoding="utf-8") as file:
+        json.dump(pdc_dict, file, indent=4)
+    print('Plan de compte {} saved.'.format(filename))
+
+
+def main():
+    """Download every plan de compte and save the merged lists as JSON in ./pdc.
+
+    For each family of `pdc_codes`, all URLs from get_pdc_urls() are
+    requested. Responses with status 200 are parsed and their chapitres,
+    comptes and references are accumulated; one JSON file per list is then
+    written with save_pdc().
+    """
+    # Output file suffix -> function extracting the matching list from a
+    # parsed plan de compte. Order matters: it is the original call order.
+    extractors = {
+        'nature_chapitres': get_nature_chapitres,
+        'nature_comptes': get_nature_comptes,
+        'fonction_chapitres': get_fonction_chapitres,
+        'fonction_comptes': get_fonction_comptes,
+        'fonction_references': get_fonction_references,
+    }
+
+    for pdc in pdc_codes:
+        pdc_urls = get_pdc_urls(pdc)
+        merged = {suffix: [] for suffix in extractors}
+
+        print('Get plan de compte {}'.format(pdc))
+        for pdc_url in pdc_urls:
+            # Without a timeout a stalled server would block the script forever.
+            pdc_response = requests.get(pdc_url, timeout=30)
+            if pdc_response.status_code != 200:
+                # Non-200 responses are skipped silently, as before.
+                continue
+            pdc_dict = xmltodict.parse(pdc_response.content, attr_prefix='', force_list=True)
+            for suffix, extract in extractors.items():
+                merged[suffix].extend(extract(pdc_dict))
+
+        for suffix, entries in merged.items():
+            save_pdc('./pdc/' + pdc + '_' + suffix + '.json', entries)
+
+
+if __name__ == "__main__":
+    main()
--- a/requirements.txt
+++ b/requirements.txt
--- a/run.py
+++ b/run.py
@ -12,6 +12,7 @@ Ainsi, "./mon/fichier1.xml" sera converti en "./mon/fichier1.csv".

 import sys
 import os
+import glob
 import json
 import csv
 import xmltodict
@ -34,7 +35,6 @@ paths = {
    'nomenclature_fonction_chapitres': 'Nomenclature[0].Fonction[0].Chapitres[0].Chapitre[*]',
    'nomenclature_fonction_references': 'Nomenclature[0].Fonction[0].RefFonctionnelles[0].RefFonc[*]',
    'references_fonctionnelles': 'RefFonc[*]',
-    
 }

 codes_natdec = {
@ -71,7 +71,8 @@ codes_artspe = {

 csv_header = ["BGT_NATDEC", "BGT_ANNEE", "BGT_SIRET", "BGT_NOM", "BGT_CONTNAT", "BGT_CONTNAT_LABEL", "BGT_NATURE", "BGT_NATURE_LABEL", "BGT_FONCTION", "BGT_FONCTION_LABEL", "BGT_OPERATION", "BGT_SECTION", "BGT_OPBUDG", "BGT_CODRD", "BGT_MTREAL", "BGT_MTBUDGPREC", "BGT_MTRARPREC", "BGT_MTPROPNOUV", "BGT_MTPREV", "BGT_CREDOUV", "BGT_MTRAR3112", "BGT_ARTSPE"]

-pdc_root_url = "http://odm-budgetaire.org/composants/normes"
+# pdc_root_url = "http://odm-budgetaire.org/composants/normes"
+pdc_directory = './pdc'


 def get_children(parents, children_name, children_path, results):
@ -92,7 +93,8 @@ def line2csv(line, config):
    "Convert line to CSV"
    
    ContNat = jmespath.search('ContNat.V', line) or 'ERROR'
-    ContNat_Label = jmespath.search('[?Code==`"{ContNat}"`].Libelle'.format(ContNat=ContNat), config['nomenclature_nature_chapitres'])
+    # ContNat_Label = jmespath.search('[?Code==`"{ContNat}"`].Libelle'.format(ContNat=ContNat), config['nomenclature_nature_chapitres'])
+    ContNat_Label = jmespath.search('[?Code==`"{ContNat}"`].Libelle'.format(ContNat=ContNat), config['nature_chapitres'])
    ContNat_Label = ContNat_Label[0] if len(ContNat_Label) else 'ERROR'
    Nature = jmespath.search('Nature.V', line) or 'ERROR'
    Nature_Label = jmespath.search('[?Code==`"{Nature}"`].Libelle'.format(Nature=Nature), config['nature_comptes'])
@ -101,7 +103,8 @@ def line2csv(line, config):
    Fonction_Label = jmespath.search('[?Code==`"{Fonction}"`].Libelle'.format(Fonction=Fonction), config['fonction_references'])
    Fonction_Label = Fonction_Label[0] if len(Fonction_Label) else ''
    Operation = jmespath.search('Operation.V', line) or ''
-    Section_Code = jmespath.search('[?Code==`"{ContNat}"`].Section'.format(ContNat=ContNat), config['nomenclature_nature_chapitres'])
+    # Section_Code = jmespath.search('[?Code==`"{ContNat}"`].Section'.format(ContNat=ContNat), config['nomenclature_nature_chapitres'])
+    Section_Code = jmespath.search('[?Code==`"{ContNat}"`].Section'.format(ContNat=ContNat), config['nature_chapitres'])
    Section = codes_section[Section_Code[0]] if len(Section_Code) else 'ERROR'
    OpBudg_Code = jmespath.search('OpBudg.V', line) or 'ERROR'
    OpBudg = codes_opbudg[OpBudg_Code]
@ -150,6 +153,12 @@ def line2csv(line, config):
    }


+def get_pdc(file):
+    """Load and return the JSON content stored in `file`."""
+    with open(file, "r") as handle:
+        return json.load(handle)
+
+
 def totem2csv(xml_file):
    """Convert a Totem XML file to CSV according SCDL schema"""
    
@ -161,23 +170,35 @@ def totem2csv(xml_file):
        xml = xf.read()
    xml_dict = xmltodict.parse(xml, attr_prefix='', force_list=False)

-    # Define plan de compte
-    year = jmespath.search(paths['budget_year'], xml_dict)
-    nomenclature = jmespath.search(paths['budget_nomenclature'], xml_dict).split('-', 1)
-    pdc_url = pdc_root_url + '/{year}/{n1}/{n2}/planDeCompte.xml'.format(year=year, n1=nomenclature[0], n2=nomenclature[1])
-    print("Plan de compte: {pdc_url}".format(pdc_url=pdc_url))
+    pdc = jmespath.search(paths['budget_nomenclature'], xml_dict).split('-', 1)[0]

-    # Get XML plan de compte from pdc_url
-    pdc_response = requests.get(pdc_url)
-    pdc_dict = xmltodict.parse(pdc_response.content, attr_prefix='', force_list=True)
+    # # Define plan de compte
+    # year = jmespath.search(paths['budget_year'], xml_dict)
+    # nomenclature = jmespath.search(paths['budget_nomenclature'], xml_dict).split('-', 1)
+    # pdc_url = pdc_root_url + '/{year}/{n1}/{n2}/planDeCompte.xml'.format(year=year, n1=nomenclature[0], n2=nomenclature[1])
+    # print("Plan de compte: {pdc_url}".format(pdc_url=pdc_url))

-    # Read plan de compte: get "nature chapitres", "nature comptes" et "fonction references"
-    nomenclature_nature_chapitres = jmespath.search(paths['nomenclature_nature_chapitres'], pdc_dict)
-    nomenclature_nature_comptes = jmespath.search(paths['nomenclature_nature_comptes'], pdc_dict)
-    nomenclature_fonction_references = jmespath.search(paths['nomenclature_fonction_references'], pdc_dict)
-    # Get "nature comptes" and "fonction references" lists
-    nature_comptes = get_children(nomenclature_nature_comptes, 'Compte', paths['comptes'], [])
-    fonction_references = get_children(nomenclature_fonction_references, 'RefFonc', paths['references_fonctionnelles'], [])
+    # # Get XML plan de compte from pdc_url
+    # pdc_response = requests.get(pdc_url)
+    # pdc_dict = xmltodict.parse(pdc_response.content, attr_prefix='', force_list=True)
+
+    # # Read plan de compte: get "nature chapitres", "nature comptes" et "fonction references"
+    # nomenclature_nature_chapitres = jmespath.search(paths['nomenclature_nature_chapitres'], pdc_dict)
+    # nomenclature_nature_comptes = jmespath.search(paths['nomenclature_nature_comptes'], pdc_dict)
+    # nomenclature_fonction_references = jmespath.search(paths['nomenclature_fonction_references'], pdc_dict)
+    # # Get "nature comptes" and "fonction references" lists
+    # nature_comptes = get_children(nomenclature_nature_comptes, 'Compte', paths['comptes'], [])
+    # fonction_references = get_children(nomenclature_fonction_references, 'RefFonc', paths['references_fonctionnelles'], [])
+
+    # nomenclature_nature_chapitres
+    # nomenclature_nature_comptes
+    # nomenclature_fonction_references
+
+    nature_chapitres = get_pdc('./pdc/' + pdc + '_nature_chapitres.json')
+    nature_comptes = get_pdc('./pdc/' + pdc + '_nature_comptes.json')
+    fonction_chapitres = get_pdc('./pdc/' + pdc + '_fonction_chapitres.json')
+    fonction_comptes = get_pdc('./pdc/' + pdc + '_fonction_comptes.json')
+    fonction_references = get_pdc('./pdc/' + pdc + '_fonction_references.json')

    # Get main informations from XML header
    NatDec_Code = jmespath.search(paths['budget_natdec'], xml_dict) or 'ERROR'
@ -187,7 +208,8 @@ def totem2csv(xml_file):
    LibelleColl = jmespath.search(paths['budget_libellecoll'], xml_dict) or 'ERROR'
    
    config = {
-        'nomenclature_nature_chapitres': nomenclature_nature_chapitres,
+        # 'nomenclature_nature_chapitres': nomenclature_nature_chapitres,
+        'nature_chapitres': nature_chapitres,
        'nature_comptes': nature_comptes,
        'fonction_references': fonction_references,
        'NatDec': NatDec,
@ -224,13 +246,18 @@ def main(argv):
    if len(argv) == 0:
        print('No XML file in command line')
        sys.exit()
+
+    # argv is the list of command-line paths: test and expand its single
+    # element, not the list itself (os.path.isdir(list) and list + str both
+    # raise TypeError, which broke every single-argument call).
+    if len(argv) == 1 and os.path.isdir(argv[0]):
+        files = glob.glob(argv[0] + '/**/*.xml', recursive=True)
+    else:
+        files = argv
    
-    for xml_file in argv:
-        if not os.path.isfile(xml_file) or not xml_file.endswith('.xml'):
-            print('File {xml_file} not exist or is not XML file'.format(xml_file=xml_file))
-            sys.exit()
-        
-        totem2csv(xml_file)
+    for file in files:
+        if not os.path.isfile(file) or not file.endswith('.xml'):
+            print('{file} not exist or is not XML file'.format(file=file))
+        else:
+            print('Convert {file} to CSV'.format(file=file))     
+            totem2csv(file)


 if __name__ == "__main__":