From 32d682383b1b5f696002939b7e45c9bd9a5122b3 Mon Sep 17 00:00:00 2001
From: Guillaume RYCKELYNCK <guillaume.ryckelynck@grandest.fr>
Date: Sun, 12 Mar 2023 09:03:01 +0100
Subject: [PATCH] version 20230312

---
 README.md        |  25 ++++++++-
 get_pdc.py       | 133 +++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt | Bin 1460 -> 1462 bytes
 run.py           |  79 +++++++++++++++++++---------
 4 files changed, 209 insertions(+), 28 deletions(-)
 create mode 100644 get_pdc.py

diff --git a/README.md b/README.md
index 317eab2..a8a2761 100644
--- a/README.md
+++ b/README.md
@@ -3,8 +3,17 @@
 Script Python permettant de convertir une liste de fichiers de budget au format Totem XML, en fichiers CSV conformes au schéma national du SCDL pour les données budgétaires.
 
 Utilisation:
+
+Récupération des plans de compte:
+```
+> python get_pdc.py
+```
+
+Conversion de fichiers XML Totem en CSV:
 ```
 > python run.py file1.xml [file2.xml file2.xml ... filen.xml]
+> # ou
+> python run.py directory
 ```
 
 Les fichiers générés sont localisés au même endroit que chaque fichier XML et aura le même nom.
@@ -16,7 +25,14 @@ Ce script s'appuie sur les plans de compte diffusés sur http://odm-budgetaire.o
 
 Il n'utilise pas de XSL pour convertir les fichiers, mais converti le XML en dictionaire Python pour traiter les informations.
 
-Il réalise la même opération que l'outil "DataClic" proposé par OpenDataFrance: https://dataclic.fr/budget. Les calculs sont basés sur le script https://gitlab.com/datafin/totem/-/blob/master/totem2csv/xsl/totem2xmlcsv.xsl. Il se limite cependant à la conversion des lignes de budget et ne réalise pars l'anonymisation du fichier Totem XML.
+Il s'inspire fortement de l'outil "DataClic" proposé par OpenDataFrance: https://dataclic.fr/budget. Les calculs de valeurs sont basés sur le script https://gitlab.com/datafin/totem/-/blob/master/totem2csv/xsl/totem2xmlcsv.xsl. Il se limite cependant à la conversion des lignes de budget et ne réalise pas l'anonymisation du fichier Totem XML.
+
+Un problème a cependant été identifié sur l'outil DataClic pour l'identification des labels de nature et fonction à partir des codes correspondants.
+Il semble que certaines lignes du budget dépendent de la nomenclature de l'année précédente. Ainsi, en utilisant le plan de compte d'une seule année, certains labels ne peuvent pas être renseignés.
+Pour contourner ce problème, le script de conversion utilise un fichier de référence pour chaque type de plan de compte, constitué par la capitalisation pluriannuelle des nomenclatures (période 2015/2023). Ces fichiers sont disponibles dans le dossier "pdc" et peuvent être générés/mis à jour via le script get_pdc.py.
+La génération de ces nomenclatures spécifiques s'appuie sur les hypothèses suivantes:
+- Un code correspond toujours au même libellé dans un type de plan de compte (ex.: M14, M71, M57, etc.). Il peut être supprimé d'une nomenclature mais n'est pas réutilisé pour un autre libellé.
+- Il n'y a pas de doublon de codes et libellés entre les sous-types de plans de compte (ex.: entre M57, M57_A, M57_D et M57_P)
 
 ## Modules utilisés
 
@@ -42,7 +58,12 @@ $ .\venv\Scripts\activate  # 'source ./venv/bin/activate' sur linux
 $ python -m pip install -r requirements.txt
 ```
 
-Lancer le script.
+Le script "get_pdc.py" permet de mettre à jour la liste des chapitres et comptes de chaque plan de compte en les téléchargeant et les enregistrant au format JSON dans le dossier "pdc".
+```
+> python get_pdc.py
+```
+
+Le script "run.py" permet de convertir un ou plusieurs fichiers XML Totem en CSV.
 ```
 $ python run.py monfichier.xml
 ```
diff --git a/get_pdc.py b/get_pdc.py
new file mode 100644
index 0000000..325e462
--- /dev/null
+++ b/get_pdc.py
@@ -0,0 +1,133 @@
+import sys
+import os
+import json
+import csv
+import xmltodict
+import jmespath
+import requests
+import pprint
+
+pdc_years = ['2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']  # years used as URL path segment by get_pdc_urls()
+pdc_codes = {  # plan de compte type -> its sub-type codes; both are URL path segments in get_pdc_urls()
+    'M14': ['M14_CCAS_INF3500', 'M14_CCAS_SUP3500', 'M14_CE', 'M14_COM_500_3500', 'M14_COM_INF500', 'M14_COM_SUP3500'],
+    'M4': ['M4', 'M41', 'M43', 'M49_A', 'M49_D'],
+    'M52': ['M52'],
+    'M57': ['M57', 'M57_A', 'M57_D', 'M57_P'],
+    'M61': ['M61'],
+    'M71': ['M71'],
+}
+
+pdc_root_url = "http://odm-budgetaire.org/composants/normes"  # base URL prepended to every path built by get_pdc_urls()
+
+
+def get_pdc_urls(pdc):
+    """Return the planDeCompte.xml URL of every (year, sub-type code) pair of plan de compte `pdc`."""
+    print('Get URL of plan de compte {}.'.format(pdc))
+    return [
+        pdc_root_url + '/{year}/{c1}/{c2}/planDeCompte.xml'.format(year=year, c1=pdc, c2=code)
+        for year in pdc_years for code in pdc_codes[pdc]
+    ]
+
+
+def get_children(parents, children_name, children_path, results):
+    """Flatten a tree of nodes depth-first into `results` (children before their parent) and return it.
+
+    The `children_name` key of a node is deleted in place once its children have been collected.
+    """
+    for node in parents:
+        nested = jmespath.search(children_path, node)
+        if nested is not None:
+            get_children(nested, children_name, children_path, results)
+            del node[children_name]
+        results.append(node)
+    return results
+
+
+def get_nature_chapitres(pdc_dict):
+    """Return the "nature" chapter entries of a parsed plan de compte, or [] when the search finds none."""
+    chapitres = jmespath.search('Nomenclature[0].Nature[0].Chapitres[0].Chapitre[*]', pdc_dict)
+    return chapitres or []
+
+
+def get_nature_comptes(pdc_dict):
+    """Return the "nature" accounts of a parsed plan de compte as one flat list, nested accounts included."""
+    top_level = jmespath.search('Nomenclature[0].Nature[0].Comptes[0].Compte[*]', pdc_dict)
+    flattened = []
+    get_children(top_level or [], 'Compte', 'Compte[*]', flattened)
+    return flattened
+
+
+def get_fonction_chapitres(pdc_dict):
+    """Return the "fonction" chapter entries of a parsed plan de compte, or [] when the search finds none."""
+    chapitres = jmespath.search('Nomenclature[0].Fonction[0].Chapitres[0].Chapitre[*]', pdc_dict)
+    return chapitres or []
+
+
+def get_fonction_comptes(pdc_dict):
+    """Return the "fonction" accounts of a parsed plan de compte as one flat list, nested accounts included."""
+    top_level = jmespath.search('Nomenclature[0].Fonction[0].Comptes[0].Compte[*]', pdc_dict)
+    flattened = []
+    get_children(top_level or [], 'Compte', 'Compte[*]', flattened)
+    return flattened
+
+
+def get_fonction_references(pdc_dict):
+    """Return the RefFonc entries of a parsed plan de compte as one flat list, nested entries included."""
+    top_level = jmespath.search('Nomenclature[0].Fonction[0].RefFonctionnelles[0].RefFonc[*]', pdc_dict)
+    flattened = []
+    get_children(top_level or [], 'RefFonc', 'RefFonc[*]', flattened)
+    return flattened
+
+
+def clean_dict(pdc_dict):
+    """Return the entries of `pdc_dict`, keeping only the first entry seen for each 'Code' value."""
+    # A dict keyed by code detects duplicates in O(1) instead of scanning a list for every entry,
+    # and keeps the first-seen order (dicts are insertion-ordered since Python 3.7).
+    first_by_code = {}
+    for entry in pdc_dict:
+        first_by_code.setdefault(entry['Code'], entry)
+    return list(first_by_code.values())
+
+
+def save_pdc(filename, pdc_dict):  # write the deduplicated entries of pdc_dict to filename as JSON
+    pdc_dict = clean_dict(pdc_dict)  # keep a single entry per 'Code'
+    with open(filename, "w") as file:
+        json.dump(pdc_dict, file, indent=4)
+    print('Plan de compte {} saved.'.format(filename))
+
+
+def main():
+    """Download the plan de compte XML files of every type and save their merged lists as JSON in ./pdc."""
+    for pdc in pdc_codes:
+        pdc_urls = get_pdc_urls(pdc)
+
+        nature_chapitres = []
+        nature_comptes = []
+        fonction_chapitres = []
+        fonction_comptes = []
+        fonction_references = []
+
+        print('Get plan de compte {}'.format(pdc))
+        for pdc_url in pdc_urls:
+            # A timeout keeps a single unresponsive request from blocking the whole run.
+            pdc_response = requests.get(pdc_url, timeout=30)
+            if pdc_response.status_code != 200:
+                # Skipped silently: presumably a year/sub-type combination that is not published.
+                continue
+            pdc_dict = xmltodict.parse(pdc_response.content, attr_prefix='', force_list=True)
+            # extend() grows each list in place instead of copying it for every downloaded file.
+            nature_chapitres.extend(get_nature_chapitres(pdc_dict))
+            nature_comptes.extend(get_nature_comptes(pdc_dict))
+            fonction_chapitres.extend(get_fonction_chapitres(pdc_dict))
+            fonction_comptes.extend(get_fonction_comptes(pdc_dict))
+            fonction_references.extend(get_fonction_references(pdc_dict))
+
+        save_pdc('./pdc/' + pdc + '_nature_chapitres.json', nature_chapitres)
+        save_pdc('./pdc/' + pdc + '_nature_comptes.json', nature_comptes)
+        save_pdc('./pdc/' + pdc + '_fonction_chapitres.json', fonction_chapitres)
+        save_pdc('./pdc/' + pdc + '_fonction_comptes.json', fonction_comptes)
+        save_pdc('./pdc/' + pdc + '_fonction_references.json', fonction_references)
+
+
+if __name__ == "__main__":  # run the download only when executed as a script
+    main()
diff --git a/requirements.txt b/requirements.txt
index 0578d78df7a4b726bd438367d9009c147d37fc90..9970b1d1750bfeeb0e8fb4e92e4117fb78291f66 100644
GIT binary patch
delta 72
zcmdnOy^VXr0!C#+20aF223`g(hD?S6hDwGKh75*$hCBvaAhZC=88R47KFFBP31S<7
T)J`^JQe`y)@is>@MKS{b53CK7

delta 70
zcmdnSy@h+j0!Bpx20aEN23`g(hD?S6hDwGKh75*$hCBvaAhZC=Sxi36n9gYqWSIc5
T<z!<fRaQeF%Xo7vQzSD0`$r9S

diff --git a/run.py b/run.py
index 1259a35..bd9e99c 100644
--- a/run.py
+++ b/run.py
@@ -12,6 +12,7 @@ Ainsi, "./mon/fichier1.xml" sera converti en "./mon/fichier1.csv".
 
 import sys
 import os
+import glob
 import json
 import csv
 import xmltodict
@@ -34,7 +35,6 @@ paths = {
     'nomenclature_fonction_chapitres': 'Nomenclature[0].Fonction[0].Chapitres[0].Chapitre[*]',
     'nomenclature_fonction_references': 'Nomenclature[0].Fonction[0].RefFonctionnelles[0].RefFonc[*]',
     'references_fonctionnelles': 'RefFonc[*]',
-    
 }
 
 codes_natdec = {
@@ -71,7 +71,8 @@ codes_artspe = {
 
 csv_header = ["BGT_NATDEC", "BGT_ANNEE", "BGT_SIRET", "BGT_NOM", "BGT_CONTNAT", "BGT_CONTNAT_LABEL", "BGT_NATURE", "BGT_NATURE_LABEL", "BGT_FONCTION", "BGT_FONCTION_LABEL", "BGT_OPERATION", "BGT_SECTION", "BGT_OPBUDG", "BGT_CODRD", "BGT_MTREAL", "BGT_MTBUDGPREC", "BGT_MTRARPREC", "BGT_MTPROPNOUV", "BGT_MTPREV", "BGT_CREDOUV", "BGT_MTRAR3112", "BGT_ARTSPE"]
 
-pdc_root_url = "http://odm-budgetaire.org/composants/normes"
+# pdc_root_url = "http://odm-budgetaire.org/composants/normes"
+pdc_directory = './pdc'
 
 
 def get_children(parents, children_name, children_path, results):
@@ -92,7 +93,8 @@ def line2csv(line, config):
     "Convert line to CSV"
     
     ContNat = jmespath.search('ContNat.V', line) or 'ERROR'
-    ContNat_Label = jmespath.search('[?Code==`"{ContNat}"`].Libelle'.format(ContNat=ContNat), config['nomenclature_nature_chapitres'])
+    # ContNat_Label = jmespath.search('[?Code==`"{ContNat}"`].Libelle'.format(ContNat=ContNat), config['nomenclature_nature_chapitres'])
+    ContNat_Label = jmespath.search('[?Code==`"{ContNat}"`].Libelle'.format(ContNat=ContNat), config['nature_chapitres'])
     ContNat_Label = ContNat_Label[0] if len(ContNat_Label) else 'ERROR'
     Nature = jmespath.search('Nature.V', line) or 'ERROR'
     Nature_Label = jmespath.search('[?Code==`"{Nature}"`].Libelle'.format(Nature=Nature), config['nature_comptes'])
@@ -101,7 +103,8 @@ def line2csv(line, config):
     Fonction_Label = jmespath.search('[?Code==`"{Fonction}"`].Libelle'.format(Fonction=Fonction), config['fonction_references'])
     Fonction_Label = Fonction_Label[0] if len(Fonction_Label) else ''
     Operation = jmespath.search('Operation.V', line) or ''
-    Section_Code = jmespath.search('[?Code==`"{ContNat}"`].Section'.format(ContNat=ContNat), config['nomenclature_nature_chapitres'])
+    # Section_Code = jmespath.search('[?Code==`"{ContNat}"`].Section'.format(ContNat=ContNat), config['nomenclature_nature_chapitres'])
+    Section_Code = jmespath.search('[?Code==`"{ContNat}"`].Section'.format(ContNat=ContNat), config['nature_chapitres'])
     Section = codes_section[Section_Code[0]] if len(Section_Code) else 'ERROR'
     OpBudg_Code = jmespath.search('OpBudg.V', line) or 'ERROR'
     OpBudg = codes_opbudg[OpBudg_Code]
@@ -150,6 +153,12 @@ def line2csv(line, config):
     }
 
 
+def get_pdc(file):
+    """Load and return the JSON content of the plan de compte file located at path `file`."""
+    with open(file, "r") as pdc_file:
+        return json.load(pdc_file)
+
+
 def totem2csv(xml_file):
     """Convert a Totem XML file to CSV according SCDL schema"""
     
@@ -161,23 +170,35 @@ def totem2csv(xml_file):
         xml = xf.read()
     xml_dict = xmltodict.parse(xml, attr_prefix='', force_list=False)
 
-    # Define plan de compte
-    year = jmespath.search(paths['budget_year'], xml_dict)
-    nomenclature = jmespath.search(paths['budget_nomenclature'], xml_dict).split('-', 1)
-    pdc_url = pdc_root_url + '/{year}/{n1}/{n2}/planDeCompte.xml'.format(year=year, n1=nomenclature[0], n2=nomenclature[1])
-    print("Plan de compte: {pdc_url}".format(pdc_url=pdc_url))
+    pdc = jmespath.search(paths['budget_nomenclature'], xml_dict).split('-', 1)[0]
 
-    # Get XML plan de compte from pdc_url
-    pdc_response = requests.get(pdc_url)
-    pdc_dict = xmltodict.parse(pdc_response.content, attr_prefix='', force_list=True)
+    # # Define plan de compte
+    # year = jmespath.search(paths['budget_year'], xml_dict)
+    # nomenclature = jmespath.search(paths['budget_nomenclature'], xml_dict).split('-', 1)
+    # pdc_url = pdc_root_url + '/{year}/{n1}/{n2}/planDeCompte.xml'.format(year=year, n1=nomenclature[0], n2=nomenclature[1])
+    # print("Plan de compte: {pdc_url}".format(pdc_url=pdc_url))
 
-    # Read plan de compte: get "nature chapitres", "nature comptes" et "fonction references"
-    nomenclature_nature_chapitres = jmespath.search(paths['nomenclature_nature_chapitres'], pdc_dict)
-    nomenclature_nature_comptes = jmespath.search(paths['nomenclature_nature_comptes'], pdc_dict)
-    nomenclature_fonction_references = jmespath.search(paths['nomenclature_fonction_references'], pdc_dict)
-    # Get "nature comptes" and "fonction references" lists
-    nature_comptes = get_children(nomenclature_nature_comptes, 'Compte', paths['comptes'], [])
-    fonction_references = get_children(nomenclature_fonction_references, 'RefFonc', paths['references_fonctionnelles'], [])
+    # # Get XML plan de compte from pdc_url
+    # pdc_response = requests.get(pdc_url)
+    # pdc_dict = xmltodict.parse(pdc_response.content, attr_prefix='', force_list=True)
+
+    # # Read plan de compte: get "nature chapitres", "nature comptes" et "fonction references"
+    # nomenclature_nature_chapitres = jmespath.search(paths['nomenclature_nature_chapitres'], pdc_dict)
+    # nomenclature_nature_comptes = jmespath.search(paths['nomenclature_nature_comptes'], pdc_dict)
+    # nomenclature_fonction_references = jmespath.search(paths['nomenclature_fonction_references'], pdc_dict)
+    # # Get "nature comptes" and "fonction references" lists
+    # nature_comptes = get_children(nomenclature_nature_comptes, 'Compte', paths['comptes'], [])
+    # fonction_references = get_children(nomenclature_fonction_references, 'RefFonc', paths['references_fonctionnelles'], [])
+
+    # nomenclature_nature_chapitres
+    # nomenclature_nature_comptes
+    # nomenclature_fonction_references
+
+    nature_chapitres = get_pdc('./pdc/' + pdc + '_nature_chapitres.json')
+    nature_comptes = get_pdc('./pdc/' + pdc + '_nature_comptes.json')
+    fonction_chapitres = get_pdc('./pdc/' + pdc + '_fonction_chapitres.json')
+    fonction_comptes = get_pdc('./pdc/' + pdc + '_fonction_comptes.json')
+    fonction_references = get_pdc('./pdc/' + pdc + '_fonction_references.json')
 
     # Get main informations from XML header
     NatDec_Code = jmespath.search(paths['budget_natdec'], xml_dict) or 'ERROR'
@@ -187,7 +208,8 @@ def totem2csv(xml_file):
     LibelleColl = jmespath.search(paths['budget_libellecoll'], xml_dict) or 'ERROR'
     
     config = {
-        'nomenclature_nature_chapitres': nomenclature_nature_chapitres,
+        # 'nomenclature_nature_chapitres': nomenclature_nature_chapitres,
+        'nature_chapitres': nature_chapitres,
         'nature_comptes': nature_comptes,
         'fonction_references': fonction_references,
         'NatDec': NatDec,
@@ -224,13 +246,18 @@ def main(argv):
     if len(argv) == 0:
         print('No XML file in command line')
         sys.exit()
+
+    if len(argv) == 1 and os.path.isdir(argv[0]):  # argv is the argument list: test its only entry, not the list
+        files = glob.glob(argv[0] + '/**/*.xml', recursive=True)  # every .xml file found below that directory
+    else:
+        files = argv
     
-    for xml_file in argv:
-        if not os.path.isfile(xml_file) or not xml_file.endswith('.xml'):
-            print('File {xml_file} not exist or is not XML file'.format(xml_file=xml_file))
-            sys.exit()
-        
-        totem2csv(xml_file)
+    for file in files:
+        if not os.path.isfile(file) or not file.endswith('.xml'):
+            print('{file} not exist or is not XML file'.format(file=file))
+        else:
+            print('Convert {file} to CSV'.format(file=file))     
+            totem2csv(file)
 
 
 if __name__ == "__main__":