totem2csv/run.py

#!/usr/bin/env python3

'''Totem2CSV
Python script that converts a list of budget files in Totem XML format into CSV files conforming to the SCDL national schema for budget data.

Usage:
> python run.py file1.xml [file2.xml file3.xml ... filen.xml]

Each generated file is written next to its source XML file and has the same base name.
For example, "./mon/fichier1.xml" is converted to "./mon/fichier1.csv".
'''

import sys
import os
import glob
import json
import csv
import xmltodict
import jmespath
import requests


# JMESPath expressions, keyed by a short name, used with jmespath.search()
# on the dictionaries produced by xmltodict.
paths = {
    # Totem budget document: header values and budget lines.
    "budget_libellecoll": "DocumentBudgetaire.EnTeteDocBudgetaire.LibelleColl.V",
    "budget_nomenclature": "DocumentBudgetaire.Budget.EnTeteBudget.Nomenclature.V",
    "budget_idetab": "DocumentBudgetaire.Budget.EnTeteBudget.IdEtab.V",
    "budget_year": "DocumentBudgetaire.Budget.BlocBudget.Exer.V",
    "budget_natdec": "DocumentBudgetaire.Budget.BlocBudget.NatDec.V",
    "budget_lines": "DocumentBudgetaire.Budget.LigneBudget[*]",

    # Nomenclature document, "Nature" part.
    # NOTE(review): the nomenclature_* and child paths below are not
    # referenced by the active code visible in this file.
    "nomenclature_nature_chapitres": "Nomenclature[0].Nature[0].Chapitres[0].Chapitre[*]",
    "nomenclature_nature_comptes": "Nomenclature[0].Nature[0].Comptes[0].Compte[*]",
    "comptes": "Compte[*]",

    # Nomenclature document, "Fonction" part.
    "nomenclature_fonction_chapitres": "Nomenclature[0].Fonction[0].Chapitres[0].Chapitre[*]",
    "nomenclature_fonction_references": "Nomenclature[0].Fonction[0].RefFonctionnelles[0].RefFonc[*]",
    "references_fonctionnelles": "RefFonc[*]",
}

# Translation tables from the raw codes read in the XML to the labels written
# to the CSV.  The 'ERROR' / 'empty' keys are the placeholders the conversion
# code substitutes when the XML value is missing.

# Budget decision type (NatDec).
# NOTE(review): 'NetDec' looks like a typo for 'NatDec'; left unchanged
# because this label is emitted in the CSV output.
codes_natdec = {
    "01": "budget primitif",
    "02": "décision modificative",
    "03": "budget supplémentaire",
    "09": "compte administratif",
    "ERROR": "NetDec inconnu",
}

# Budget section.
codes_section = {
    "I": "investissement",
    "F": "fonctionnement",
    "ERROR": "ERROR",
}

# Operation type (OpBudg).
codes_opbudg = {
    "0": "réel",
    "1": "ordre",
    "ERROR": "ERROR",
}

# Revenue / expense flag (CodRD).
codes_codrd = {
    "R": "recette",
    "D": "dépense",
    "ERROR": "ERROR",
}

# Specialised-article flag (ArtSpe).
codes_artspe = {
    "false": "non spécialisé",
    "true": "spécialisé",
    "empty": "",
}

# Column names of the generated CSV, in output order.
csv_header = [
    "BGT_NATDEC",
    "BGT_ANNEE",
    "BGT_SIRET",
    "BGT_NOM",
    "BGT_CONTNAT",
    "BGT_CONTNAT_LABEL",
    "BGT_NATURE",
    "BGT_NATURE_LABEL",
    "BGT_FONCTION",
    "BGT_FONCTION_LABEL",
    "BGT_OPERATION",
    "BGT_SECTION",
    "BGT_OPBUDG",
    "BGT_CODRD",
    "BGT_MTREAL",
    "BGT_MTBUDGPREC",
    "BGT_MTRARPREC",
    "BGT_MTPROPNOUV",
    "BGT_MTPREV",
    "BGT_CREDOUV",
    "BGT_MTRAR3112",
    "BGT_ARTSPE",
]

# pdc_root_url = "http://odm-budgetaire.org/composants/normes"
# Directory, relative to the current working directory, holding the plan de
# compte JSON files.
pdc_directory = "./pdc"


def get_children(parents, children_name, children_path, results):
    """Flatten a tree of dictionaries into ``results`` and return it.

    Each element of ``parents`` is searched with the JMESPath expression
    ``children_path``.  When the search yields something, the children are
    flattened first (recursively) and the ``children_name`` key is then
    deleted from the parent, so descendants are appended before their
    ancestor.  The parent dictionaries are modified in place.

    Args:
        parents: Iterable of dictionaries to walk.
        children_name: Key removed from a parent once its children are handled.
        children_path: JMESPath expression selecting a parent's children.
        results: List the flattened dictionaries are appended to.

    Returns:
        The ``results`` list that was passed in.
    """
    for node in parents:
        nested = jmespath.search(children_path, node)
        if nested is not None:
            get_children(nested, children_name, children_path, results)
            del node[children_name]
        # The node itself is recorded whether or not it had children.
        results.append(node)
    return results


def _budget_value(line, field, default=''):
    """Return the ``V`` attribute of ``field`` in a budget line, or ``default`` when missing or empty."""
    return jmespath.search(field + '.V', line) or default


def _pdc_attribute(entries, code, attribute, default):
    """Return ``attribute`` of the first entry of ``entries`` whose ``Code`` equals ``code``.

    ``default`` is returned when nothing matches, including when the search
    itself yields ``None`` (for instance if ``entries`` is not a list).
    """
    # The code is interpolated into the expression as a JSON string literal.
    # NOTE(review): a code containing a double quote or a backtick would
    # produce an invalid expression - presumably codes never do; confirm.
    expression = '[?Code==`"{code}"`].{attribute}'.format(code=code, attribute=attribute)
    matches = jmespath.search(expression, entries)
    return matches[0] if matches else default


def line2csv(line, config):
    """Convert one budget line to a CSV row.

    Missing ``ContNat``, ``Nature``, ``OpBudg`` and ``CodRD`` values, as well
    as labels or sections that cannot be resolved, are reported with the
    ``'ERROR'`` placeholder; the other missing values become empty strings.
    Codes that are present but not listed in the translation tables fall back
    to the same placeholders instead of raising ``KeyError``.

    Args:
        line: Dictionary of one budget line; each field is read through its
            ``V`` attribute (e.g. ``line['Nature']['V']``).
        config: Dictionary providing the lookup lists ``nature_chapitres``,
            ``nature_comptes`` and ``fonction_references`` (entries are
            matched on their ``Code`` key) and the document-level values
            ``NatDec``, ``Exer``, ``IdEtab`` and ``LibelleColl``.

    Returns:
        A dictionary keyed by the ``BGT_*`` column names.
    """
    cont_nat = _budget_value(line, 'ContNat', 'ERROR')
    nature = _budget_value(line, 'Nature', 'ERROR')
    fonction = _budget_value(line, 'Fonction')

    # The chapter entry supplies both the chapter label and the section code.
    nature_chapitres = config['nature_chapitres']
    section_code = _pdc_attribute(nature_chapitres, cont_nat, 'Section', 'ERROR')

    op_budg_code = _budget_value(line, 'OpBudg', 'ERROR')
    cod_rd_code = _budget_value(line, 'CodRD', 'ERROR')
    art_spe_code = _budget_value(line, 'ArtSpe', 'empty')

    return {
        'BGT_NATDEC': config['NatDec'],
        # 'BGT_NUMDEC': '',
        'BGT_ANNEE': config['Exer'],
        'BGT_SIRET': config['IdEtab'],
        'BGT_NOM': config['LibelleColl'],
        'BGT_CONTNAT': cont_nat,
        'BGT_CONTNAT_LABEL': _pdc_attribute(nature_chapitres, cont_nat, 'Libelle', 'ERROR'),
        'BGT_NATURE': nature,
        'BGT_NATURE_LABEL': _pdc_attribute(config['nature_comptes'], nature, 'Libelle', 'ERROR'),
        'BGT_FONCTION': fonction,
        'BGT_FONCTION_LABEL': _pdc_attribute(config['fonction_references'], fonction, 'Libelle', ''),
        'BGT_OPERATION': _budget_value(line, 'Operation'),
        # Unknown codes map to the table's placeholder rather than aborting
        # the whole conversion with a KeyError.
        'BGT_SECTION': codes_section.get(section_code, codes_section['ERROR']),
        'BGT_OPBUDG': codes_opbudg.get(op_budg_code, codes_opbudg['ERROR']),
        'BGT_CODRD': codes_codrd.get(cod_rd_code, codes_codrd['ERROR']),
        'BGT_MTREAL': _budget_value(line, 'MtReal'),
        'BGT_MTBUDGPREC': _budget_value(line, 'MtBudgPrec'),
        'BGT_MTRARPREC': _budget_value(line, 'MtRARPrec'),
        'BGT_MTPROPNOUV': _budget_value(line, 'MtPropNouv'),
        'BGT_MTPREV': _budget_value(line, 'MtPrev'),
        'BGT_CREDOUV': _budget_value(line, 'CredOuv'),
        'BGT_MTRAR3112': _budget_value(line, 'MtRAR3112'),
        'BGT_ARTSPE': codes_artspe.get(art_spe_code, codes_artspe['empty']),
    }


def get_pdc(file):
    """Load a plan de compte JSON file and return its decoded content.

    The file is read as UTF-8 so that accented labels decode the same way on
    every platform, whatever the locale's default encoding is.

    Args:
        file: Path to the JSON file.

    Returns:
        The Python object decoded from the JSON document.
    """
    with open(file, "r", encoding="utf-8") as f:
        return json.load(f)


def totem2csv(xml_file):
    """Convert a Totem XML budget file to a CSV file following the SCDL schema.

    The CSV file is written next to ``xml_file``, with the same base name and
    a ``.csv`` extension, and receives one row per ``LigneBudget`` element.
    Labels are resolved from the plan de compte JSON files found in
    ``pdc_directory``; their file name prefix is the part of the budget
    nomenclature located before the first ``-``.

    Args:
        xml_file: Path to the Totem XML file to convert.

    Raises:
        ValueError: If the XML document does not declare a budget nomenclature.
    """
    csv_file = os.path.splitext(xml_file)[0] + '.csv'

    # Parse from raw bytes so the XML parser applies the encoding declared in
    # the document instead of the platform's default text encoding.
    # 'LigneBudget' is forced to a list: otherwise a document holding a single
    # budget line yields a dict and the '[*]' projection returns nothing.
    with open(xml_file, 'rb') as xf:
        xml_dict = xmltodict.parse(xf.read(), attr_prefix='', force_list=('LigneBudget',))

    nomenclature = jmespath.search(paths['budget_nomenclature'], xml_dict)
    if not nomenclature:
        raise ValueError('No budget nomenclature found in {xml_file}'.format(xml_file=xml_file))
    pdc = nomenclature.split('-', 1)[0]

    def load_pdc(kind):
        # The plan de compte is read from local JSON files named '<pdc>_<kind>.json'.
        pdc_file = '{pdc}_{kind}.json'.format(pdc=pdc, kind=kind)
        return get_pdc(os.path.join(pdc_directory, pdc_file))

    # Get main informations from XML header
    natdec_code = jmespath.search(paths['budget_natdec'], xml_dict) or 'ERROR'

    config = {
        'nature_chapitres': load_pdc('nature_chapitres'),
        'nature_comptes': load_pdc('nature_comptes'),
        'fonction_references': load_pdc('fonction_references'),
        # An unlisted decision code gets the table's error label instead of
        # raising KeyError.
        'NatDec': codes_natdec.get(natdec_code, codes_natdec['ERROR']),
        'Exer': jmespath.search(paths['budget_year'], xml_dict) or 'ERROR',
        'IdEtab': jmespath.search(paths['budget_idetab'], xml_dict) or 'ERROR',
        'LibelleColl': jmespath.search(paths['budget_libellecoll'], xml_dict) or 'ERROR',
    }

    # Get budget lines; a document without any line yields an empty list.
    budget_lines = jmespath.search(paths['budget_lines'], xml_dict) or []
    nb_xml_lines = len(budget_lines)
    print("{nb_xml_lines} lignes de budget dans le fichier {xml_file}.".format(nb_xml_lines=nb_xml_lines, xml_file=xml_file))

    # Open CSV file to save lines
    nb_csv_lines = 0
    with open(csv_file, 'w', newline='', encoding="utf-8") as cf:
        csv_writer = csv.DictWriter(cf, fieldnames=csv_header, delimiter=',', quoting=csv.QUOTE_NONNUMERIC)
        csv_writer.writeheader()

        # Generate and save CSV line
        # Cf. process details in https://gitlab.com/datafin/totem/-/blob/master/totem2csv/xsl/totem2xmlcsv.xsl
        for id_line, line in enumerate(budget_lines, start=1):
            print('Traitement et enregistrement de la ligne {id_line}/{nb_xml_lines}'.format(id_line=id_line, nb_xml_lines=nb_xml_lines))
            csv_writer.writerow(line2csv(line, config))
            nb_csv_lines += 1

    print("{nb_csv_lines} lignes enregistrées dans le fichier {csv_file} sur {nb_xml_lines}.".format(nb_csv_lines=nb_csv_lines, csv_file=csv_file, nb_xml_lines=nb_xml_lines))


def main(argv):
    """Convert every XML file designated on the command line.

    Each argument is either a file path or a directory.  A directory is
    searched recursively for ``*.xml`` files; any other argument is taken as
    a file path.  Paths that are not existing ``.xml`` files are reported and
    skipped.

    Args:
        argv: Command-line arguments, without the program name.

    Raises:
        SystemExit: If ``argv`` is empty.
    """
    if not argv:
        print('No XML file in command line')
        sys.exit()

    files = []
    for arg in argv:
        if os.path.isdir(arg):
            # Escape the directory name so glob metacharacters in it are
            # taken literally; sort for a reproducible processing order.
            pattern = os.path.join(glob.escape(arg), '**', '*.xml')
            files.extend(sorted(glob.glob(pattern, recursive=True)))
        else:
            files.append(arg)

    for file in files:
        if not os.path.isfile(file) or not file.endswith('.xml'):
            print('{file} not exist or is not XML file'.format(file=file))
        else:
            print('Convert {file} to CSV'.format(file=file))
            totem2csv(file)


# Script entry point: pass the command-line arguments, minus the program name.
if __name__ == "__main__":
    cli_arguments = sys.argv[1:]
    main(cli_arguments)