sdi-checker/sdi_checker/libs/sdi_consistence_check/GeoserverToGeonetworkUpdater.py
Guillaume RYCKELYNCK c6c2ee9c04 first commit
2024-06-22 22:00:42 +02:00

326 lines
15 KiB
Python

#
# scénario 3: Synchronisation Geoserver vers GeoNetwork
#
# en entrée: nom d'espace de travail GS (et url du GeoServer)
# Parcours des couches de l'espace de travail
# Si aucune metadataURL, remonter la couche en erreur
# Sinon, aller sur GN et récupérer la MDD, puis les MDS du workspace
# Si celles-ci n'existent pas, les créer.
#
# Problemes:
#
# * la MDS risque d'etre celle globale à tout le GeoServer, le cahier des charges prévoit
# de créer 2 métadonnées de service par espace de travail (une pour WMS et une pour WFS). Cela risque de donner
# deux fiches quasiment identiques, mais aussi quasiment vides. (les seules infos que l'on ait sur un workspace coté
# geoserver étant le nom et son URL de namespace).
#
# * Il faut pouvoir discrimer ces nouvelles MDs afin de les retrouver par lancement successifs. Ce n'est pas vraiment
# faisable en l'état sans adopter une convention. Disons que l'URL du service doit etre de la forme (commencer par,
# sans respecter la casse):
# - http://url-geoserver/geoserver/nom-workspace/ows?service=wms pour wms
# - http://url-geoserver/geoserver/nom-workspace/ows?service=wfs pour wfs
#
import argparse
import logging
import re
import sys
import uuid
import warnings
import xml.etree.ElementTree as ET
from time import localtime
from time import strftime
from geoserver.catalog import Catalog
from mako.template import Template
from owslib.csw import CatalogueServiceWeb
from requests.exceptions import SSLError
from GeonetworkToGeoserverUpdater import print_report
from bypassSSLVerification import bypassSSLVerification
from credentials import Credentials
from check_csw import CSWQuerier
from inconsistency import GsToGnUnableToCreateServiceMetadataInconsistency, Inconsistency, \
GsToGnUnableToUpdateServiceMetadataInconsistency
from utils import find_data_metadata, print_report, load_workspaces_mapping
def init_mdd_mds_mapping(cswQuerier):
mds = cswQuerier.get_service_mds(constraints=[CSWQuerier.non_harvested])
mdd_to_mds = {}
for mduuid, md in mds.items():
for mdd in md.serviceidentification.operateson:
uuidref = mdd["uuidref"]
if mdd_to_mds.get(uuidref, None) is None:
mdd_to_mds[uuidref] = []
mdd_to_mds[uuidref].append(mduuid)
return mdd_to_mds
# Logging configuration
logger = logging.getLogger("GsToGnUpdater")
out_hdlr = logging.StreamHandler(sys.stdout)
out_hdlr.setLevel(logging.INFO)
logger.addHandler(out_hdlr)
logger.setLevel(logging.INFO)
# these variables are global to have a hand easily on cached resources obtained remotely
# TODO: define
def print_banner(args):
"""
Show the banner
:param args: the argument object passed to the script.
:return: nothing.
"""
logger.info("\nGeoserver To Geonetwork Updater\n\n")
logger.info("Workspace to query: %s", args.workspace)
logger.info("GeoServer: %s", args.geoserver)
logger.info("GeoNetwork where to insert created metadata: %s", args.geonetwork)
logger.info("Dry-run: %s", args.dry_run)
logger.info("\nStart time: %s", strftime("%Y-%m-%d %H:%M:%S", localtime()))
logger.info("\n\n")
def workspace_service_url(gs_url, workspace, service):
return "%s/%s/ows?service=%s" % (gs_url, workspace, service)
def guess_related_service_metadata(gs_url, gn_url, workspace, service):
"""
Retrieves the associated metadata from the same catalog (assuming it is a GeoNetwork)
:param md: the parsed metadata
:param gs_url: the GeoServer base URL
:param gn_url: the catalogue base URL
:param workspace: the workspace name
:param service: the targeted service (WMS or WFS)
:return the service metadata related to the service, if found
"""
gn_cswurl = "%s/srv/eng/csw" % gn_url
csw_q = CSWQuerier(gn_cswurl)
mds = csw_q.get_all_records(constraints=[And([csw_q.is_service, csw_q.non_harvested])])
expected_service_url = workspace_service_url(gs_url, workspace, service)
for smd_uuid, smd in mds.items():
try:
if smd.distribution.online[0].url == expected_service_url:
return smd
except:
continue
return None
def create_service_metadata_from_template(data, service_type):
"""
Returns a string suitable for a service metadata creation via CSW-T insert operation.
:param data: a hashmap which has the following format:
{
'file_identifier': md_uuid,
'current_date': "YYY-mm-dd",
'service_name': the name of the GS workspace,
'service_url': the service URL (see remark in comment above, should be used carefully
as identifier to get the MD),
'current_datetime': "YYYY-mm-dd HH:mm",
'abstract': abstract for the metadata,
'title': a title for the metadata,
'layers': [
'mdd_uuid': the uuid of the data metadata,
'mdd_url': the URL of the data metadata,
'name': the layer name
]
}
:param service_type: the service type ('wms' or 'wfs')
:return: a string representing the XML service metadata
"""
return Template(filename="template/service-metadata-%s.xml" % (service_type)).render(**data)
def insert_metadata(gn_url, workspace, record, credentials=Credentials()):
"""
Inserts a metadata in a catalogue, given its base URL.
:param gn_url: the base URL to the GeoNetwork.
:param workspace: the workspace name queried onto the GeoServer
:param record: the metadata as string to be inserted
:param credentials: the credentials object
:return: True if success, raises an exception otherwise.
"""
try:
(username, password) = credentials.getFromUrl(gn_url)
csw = CatalogueServiceWeb(gn_url + "/srv/eng/csw-publication", username=username, password=password)
csw.transaction(ttype="insert", typename="gmd:MD_Metadata", record=record)
return True
except Exception as e:
logger.error(e)
raise GsToGnUnableToCreateServiceMetadataInconsistency(workspace, gn_url, e)
def update_metadata(gn_url, uuid, record, credentials=Credentials()):
"""
Updates a metadata in the catalogue, given its UUID
:param gn_url the GeoNetwork base url
:param uuid the unique identifier of the metadata to be updated
:param record the XML as string for the updated metadata
:param credentials the Credentials object
:return: True if success, raises an exception otherwise
"""
try:
(username, password) = credentials.getFromUrl(gn_url)
csw = CatalogueServiceWeb(gn_url + "/srv/eng/csw-publication", username=username, password=password)
csw.transaction(ttype="update", typename="gmd:MD_Metadata", record=record, identifier=uuid)
return True
except Exception as e:
logger.error(e)
raise GsToGnUnableToUpdateServiceMetadataInconsistency(workspace, uuid, gn_url, e)
def add_operates_on(xmlmd, md_url, uuidref):
# /gmd:MD_Metadata/gmd:identificationInfo/srv:SV_ServiceIdentification/srv:operatesOn
parsed_md = ET.fromstring(xmlmd)
operates_on_elem = ET.Element('{http://www.isotc211.org/2005/srv}operatesOn')
operates_on_elem.attrib["uuidref"] = uuidref
operates_on_elem.attrib["{http://www.w3.org/1999/xlink}href"] = md_url
parsed_md.find(".//{http://www.isotc211.org/2005/srv}SV_ServiceIdentification").append(operates_on_elem)
return ET.tostring(parsed_md)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--workspace", help="indicates the GeoServer workspace name.", required=True)
parser.add_argument("--geoserver", help="the GeoServer URL to use (e.g. 'http://localhost:8080/geoserver').", required=True)
parser.add_argument("--geonetwork", help="the Geonetwork URL where the created metadatas have to be stored"
" (e.g. 'http://localhost:8080/geonetwork').", required=True)
parser.add_argument("--service", help="The service ('wfs' or 'wms')", choices=['wms', 'wfs'])
parser.add_argument("--dry-run", help="Dry-run mode", action='store_true', default=False)
parser.add_argument("--disable-ssl-verification", help="Disable certificate verification", action="store_true")
parser.add_argument("--workspaces-mapping", help="the INI file to be loaded to resolve title and abstract on "
"created service metadata",
default="template/workspaces-mapping.ini.example")
args = parser.parse_args(sys.argv[1:])
if (args.workspace is None or args.geoserver is None or
args.geonetwork is None or args.service not in ["wfs", "wms"]):
parser.print_help()
sys.exit()
print_banner(args)
if args.disable_ssl_verification:
bypassSSLVerification()
# Disable FutureWarning from owslib
warnings.simplefilter("ignore", category=FutureWarning)
# Load credentials
creds = Credentials(logger=logger)
(user, password) = creds.getFromUrl(args.geoserver)
# Load the mapping file
try:
workspaces_mappings = load_workspaces_mapping(args.workspaces_mapping)
except Exception as e:
logger.error("Unable to load the mapping file for title/abstract.", e)
sys.exit(1)
gscatalog = Catalog(args.geoserver + "/rest/", username=user, password=password)
errors = []
try:
workspace = gscatalog.get_workspace(name=args.workspace)
except SSLError as e:
logger.error("Unable to connect: SSL error (hint: use --disable-ssl-verification option)")
sys.exit(1)
if workspace is None:
logger.error("Workspace \"%s\" not found" % args.workspace)
sys.exit(1)
else:
resources = gscatalog.get_resources(workspace=workspace)
for res in resources:
try:
# UUID from MDD is needed for operatesOn elements
md_url, md = find_data_metadata(res, creds, args.disable_ssl_verification)
layer = gscatalog.get_layer(name="%s:%s" % (res.workspace.name, res.name))
linked_md = guess_related_service_metadata(args.geoserver, args.geonetwork,
args.workspace, args.service)
if linked_md is None:
# Creates a new service metadata for the workspace
logger.info("No service metadata found for %s, creating one", args.workspace)
title = args.workspace
abstract = "Métadonnée de service décrivant " \
"l'espace de travail \"%s\" (%s)" % (args.workspace, args.service)
try:
title = workspaces_mappings[args.workspace]['title_' + args.service]
abstract = workspaces_mappings[args.workspace]['abstract_' + args.service]
except Exception as e:
logger.error("Unable to get a title or an abstract for created metadata, using default values")
data = {
'file_identifier': uuid.uuid4(),
'current_date': strftime("%Y-%m-%d", localtime()),
'service_name': args.workspace,
'service_url': workspace_service_url(args.geoserver, args.workspace, args.service),
'current_datetime': strftime("%Y-%m-%d %H:%M:%S", localtime()),
'abstract': abstract,
'title': title,
'layers': []
}
# reiterates on the resources object, to first create the complete service metadata
# (with all the linked data MD).
# this should not harm since we don't touch to the array, and access it only in a read-only mode
for r2 in resources:
try:
r2mdurl, r2md = find_data_metadata(r2, creds, args.disable_ssl_verification)
data['layers'].append({
'mdd_uuid': r2md.identifier,
'mdd_url': r2mdurl,
'name': r2.name,
'title': r2.title
})
except:
logger.error("Unable to find all the data metadata for some layers on workspace '%s',"
" generated service metadata might be incomplete.", args.workspace)
logger.error("Layer with missing data metadata URL: '%s:%s'", args.workspace,
r2.name)
new_srv_md = create_service_metadata_from_template(data, args.service)
if not args.dry_run:
insert_metadata(args.geonetwork, args.workspace, new_srv_md, creds)
else:
logger.info("Dry-run: would have created a service metadata for workspace '%s'"
" and metadata '%s'", args.workspace, md.identifier)
else:
# service metadata found for the current workspace
# we still have to check if the mds references (operatesOn)
# all the MDD defined in the layers' workspace.
operateson_found = False
for oon in linked_md.serviceidentification.operateson:
if oon['uuidref'] == md.identifier:
operateson_found = True
break
if not operateson_found:
logger.info("MDD '%s' is not referenced into the service MD '%s' (on server '%s'). "
"Adding the operatesOn link", md_url, linked_md.identifier, args.geonetwork)
xml_mds = add_operates_on(linked_md.xml, md_url, md.identifier)
if not args.dry_run:
update_metadata(args.geonetwork, linked_md.identifier, xml_mds, creds)
else:
logger.info("Dry-run: would have updated md %s [adding operatesOn on data md %s]",
linked_md.identifier, md.identifier)
# TODO: also check / fix SV_CoupledResource ?
except Inconsistency as e:
errors.append(e)
# cswquerier object will depend on the MD URL obtained from the GeoServer.
# We can imagine having more than one catalogue instance, then we need
# to find a way to keep the MDD to MDS mapping for each of these catalogues.
#
#csw_q = None
#servicesmd = csw_q.get_all_records(constraints=[And([csw_q.is_service, csw_q.non_harvested])])
#data_to_service_map = {}
#for uuid, md in servicesmd.items():
# for oon in md.identificationinfo[0].operateson:
# if data_to_service_map.get(oon['uuidref']) is None:
# data_to_service_map[oon['uuidref']] = [uuid]
# else:
# data_to_service_map[oon['uuidref']] = data_to_service_map[oon['uuidref']] + [uuid]
print_report(logger, errors)