179 lines
7.2 KiB
Python
179 lines
7.2 KiB
Python
import logging
|
|
import re
|
|
import warnings
|
|
import xml.etree.ElementTree as ET
|
|
from contextlib import suppress
|
|
from urllib.parse import urlparse
|
|
|
|
from owslib import namespaces
|
|
from owslib.csw import CatalogueServiceWeb
|
|
from owslib.fes import PropertyIsEqualTo, Not, Or, And
|
|
from owslib.util import ServiceException
|
|
|
|
from sdi_cc_check.libs.sdi_consistence_check.credentials import Credentials
|
|
from sdi_cc_check.libs.sdi_consistence_check.inconsistency import Inconsistency, GnToGsNoGetCapabilitiesUrl
|
|
from sdi_cc_check.libs.sdi_consistence_check.check_ows import CachedOwsServices
|
|
|
|
|
|
class CSWQuerier:
    """Query a CSW catalogue and check service metadata against OWS services.

    The instance wraps an OWSLib ``CatalogueServiceWeb`` client (``self.csw``)
    and a ``CachedOwsServices`` helper (``self.owsServices``) used to verify
    that the layers referenced by a service metadata exist on the WMS/WFS
    server it points to.
    """

    # Page size used by the paginated queries.
    max_records = 100

    # OGC filters shared by the query helpers below.
    is_dataset = PropertyIsEqualTo("Type", "dataset")
    is_service = PropertyIsEqualTo("Type", "service")
    non_harvested = PropertyIsEqualTo("isHarvested", "false")

    # Accepts protocol strings such as "OGC:WMS", "OGC:WFS-1.0.0" or
    # "OGC:WMS-1.3.0-http-get-map" (case-insensitive). Raw string so that
    # "\d", "\." and "\w" are regex escapes and not invalid string escapes.
    protocol_regexp = re.compile(
        r"^OGC:(?P<type>WMS|WFS)(?:-(?P<version>\d+(?:\.\d+)*)(?:-[\w-]+)?)?$",
        re.IGNORECASE,
    )

    # ElementTree paths used to read the coupled resources of a service metadata.
    _COUPLED_RESOURCE_PATH = ".//{http://www.isotc211.org/2005/srv}coupledResource"
    _OPERATION_NAME_PATH = (
        ".//{http://www.isotc211.org/2005/srv}operationName"
        "/{http://www.isotc211.org/2005/gco}CharacterString"
    )
    _IDENTIFIER_PATH = (
        ".//{http://www.isotc211.org/2005/srv}identifier"
        "/{http://www.isotc211.org/2005/gco}CharacterString"
    )
    _SCOPED_NAME_PATH = ".//{http://www.isotc211.org/2005/gco}ScopedName"

    def __init__(self, url, credentials=None,
                 cached_ows_services=None, logger=None, timeout=30):
        """
        Connects to the CSW endpoint.

        :param url: the URL of the CSW service.
        :param credentials: the Credentials object used to look up the
            username/password for ``url``; a new empty one is created when omitted.
        :param cached_ows_services: an existing CachedOwsServices to reuse; a new
            one is created when omitted (or falsy).
        :param logger: the logger to use; defaults to the "cswquerier" logger.
        :param timeout: timeout handed to the CachedOwsServices created here.
        :raises ServiceException: if the CSW client cannot be created.
        """
        # Build the default here rather than in the signature, so that a single
        # Credentials instance is not created at class-definition time and
        # shared by every CSWQuerier.
        if credentials is None:
            credentials = Credentials()
        username, password = credentials.getFromUrl(url)

        if logger is not None:
            self.logger = logger
        else:
            self.logger = logging.getLogger("cswquerier")
            # logging.getLogger() always returns the same object for a given
            # name: only attach the NullHandler once instead of once per instance.
            if not self.logger.handlers:
                self.logger.addHandler(logging.NullHandler())

        self.owsServices = cached_ows_services or CachedOwsServices(
            credentials=credentials, timeout=timeout)

        # NOTE(review): `timeout` is not forwarded to CatalogueServiceWeb, which
        # therefore uses its own default - confirm whether this is intended.
        try:
            self.csw = CatalogueServiceWeb(url, username=username, password=password)
        except Exception as ex:
            raise ServiceException(ex) from ex

        self.mds_not_parsable = []
        self.reset()

    def reset(self):
        """Rewinds the paging state used by get_dataset_records()."""
        self.start = 0
        self.md_count = -1

    @staticmethod
    def _typed_constraints(constraints, type_filter):
        """Returns the getrecords2 constraints combining `constraints` with `type_filter`."""
        if constraints:
            return [And(list(constraints) + [type_filter])]
        return [type_filter]

    @staticmethod
    def _find_text(element, path):
        """Returns the text of the first sub-element matching `path`, or None if absent."""
        found = element.find(path)
        return None if found is None else found.text

    def get_dataset_records(self, constraints=None):
        """
        Fetches the next page (at most `max_records`) of dataset records.

        Each call advances `self.start` by the number of records returned, so
        successive calls walk through the result set; call reset() to rewind.

        :param constraints: optional list of filters ANDed with the dataset filter.
        :return: the records of the fetched page (``self.csw.records``).
        """
        self.csw.getrecords2(
            constraints=self._typed_constraints(constraints, self.is_dataset),
            esn='full',
            startposition=self.start,
            maxrecords=self.max_records,
        )
        self.logger.debug(
            "CSWQuerier.get_records() results : %s (start=%s, max=%s)",
            self.csw.results,
            self.start,
            self.max_records,
        )
        self.start += self.csw.results['returned']
        return self.csw.records

    def get_md(self, uuid):
        """
        Returns the record identified by `uuid` from the last fetched records.

        :raises KeyError: if `uuid` is not among the records of the last query.
        """
        return self.csw.records[uuid]

    def get_service_mds(self, constraints=None):
        """
        Fetches the service metadata in a single request (no pagination).

        :param constraints: optional list of filters ANDed with the service filter.
        :return: the records returned by the catalogue (``self.csw.records``).
        """
        # do not take care of FutureWarnings issued by OWSLib
        with warnings.catch_warnings():
            # catch_warnings() only saves/restores the filters: the "ignore"
            # filter has to be installed explicitly to silence anything.
            warnings.simplefilter("ignore", FutureWarning)
            self.csw.getrecords2(
                # Service filter (not the dataset one) also when extra
                # constraints are supplied.
                constraints=self._typed_constraints(constraints, self.is_service),
                esn='full',
                outputschema=namespaces['gmd'],
                startposition=0,
                maxrecords=1000000,
            )
        return self.csw.records

    def get_data_mds(self, constraints=None):
        """
        Fetches the dataset metadata in a single request (no pagination).

        :param constraints: optional list of filters ANDed with the dataset filter.
        :return: the records returned by the catalogue (``self.csw.records``).
        """
        self.csw.getrecords2(
            constraints=self._typed_constraints(constraints, self.is_dataset),
            esn='full',
            outputschema=namespaces['gmd'],
            startposition=0,
            maxrecords=1000000,
        )
        return self.csw.records

    def get_all_records(self, constraints=None):
        """
        Gets all records, also managing the pagination against the remote CSW server.

        :param constraints: the constraint array to be passed to OWSLib getrecords2.
        :return: a hashmap with UUID as key, the parsed metadata as value.
        """
        constraints = [] if constraints is None else constraints
        startpos = 0
        mds = {}
        while True:
            self.csw.getrecords2(
                constraints,
                esn='full',
                outputschema=namespaces['gmd'],
                startposition=startpos,
                maxrecords=self.max_records,
            )
            already_known = len(mds)
            mds.update(self.csw.records)
            startpos = len(mds) + 1
            if startpos > self.csw.results['matches']:
                break
            if len(mds) == already_known:
                # The page brought nothing new (empty page or only duplicated
                # UUIDs): the next request would be identical, so stop instead
                # of looping forever.
                break
        return mds

    def check_service_md(self, mds, mdd, geoserver_to_check=None):
        """
        Checks that the layers coupled to a service metadata exist on its OWS server.

        Nothing is checked (and nothing is raised) when the service metadata has
        no "operatesOn" entry, when the host of its GetCapabilities URL is not in
        `geoserver_to_check`, or when its protocol is not a recognized WMS/WFS one.

        Warnings are ignored for the duration of the call only; the warning
        filters of the process are restored on exit.

        :param mds: the parsed service metadata.
        :param mdd: the parsed data metadata (used for logging and error reporting).
        :param geoserver_to_check: the hostnames of the servers to check.
        :raises GnToGsNoGetCapabilitiesUrl: if no GetCapabilities URL is found.
        :raises Inconsistency: if a layer check fails; `md_uuid` is set to the
            identifier of the service metadata.
        """
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self._check_service_md(mds, mdd, geoserver_to_check or [])

    def _check_service_md(self, mds, mdd, geoserver_to_check):
        """Does the actual work of check_service_md()."""
        # check if this is an interesting service md (contains "coupledResource" or "operatesOn" tag)
        if not mds.serviceidentification.operateson:
            # raise error ?
            return

        self.logger.info("\nData metadata: uuid %s \"%s\"", mdd.identifier, mdd.identification.title)
        self.logger.info("Service metadata: uuid %s \"%s\"", mds.identifier, mds.identification.title)

        # retrieve geoserver base URL (getCapabilities); when several
        # GetCapabilities operations are declared, the last one wins.
        url = None
        protocol = None
        for op in mds.serviceidentification.operations:
            # An operation without any connect point carries no URL: skip it
            # rather than failing on the [0] lookup.
            if op['name'] == "GetCapabilities" and op['connectpoint']:
                connect_point = op['connectpoint'][0]
                url = connect_point.url
                protocol = connect_point.protocol

        if url is None:
            raise GnToGsNoGetCapabilitiesUrl(mds.identifier, mdd.identifier)

        url_parsed = urlparse(url)
        if url_parsed.hostname not in geoserver_to_check:
            self.logger.info("\tSkipping : geoserver : %s not in authorized list (%s)",
                             url_parsed.hostname, url)
            # do not raise error, since the underlying WxS server is not in the list of checked geoserver
            return

        # try to read protocol
        # TODO: which protocol to check ? is there a convention at Rennes-metropole on this ?
        # A missing protocol is handled like an unrecognized one.
        matches = None if protocol is None else self.protocol_regexp.match(protocol)
        if matches is None:
            self.logger.error("Invalid protocol : %s ", protocol)
            # raise error ?
            return

        service_type = matches.group("type")
        version = matches.group("version")
        self.logger.debug("Server Type: %s Version: %s URL: %s", service_type, version, url)

        root = ET.fromstring(mds.xml.decode())
        for resource in root.findall(self._COUPLED_RESOURCE_PATH):
            # Each field is read independently: the operation name is only
            # logged, so its absence must not prevent the layer check.
            operation_name = self._find_text(resource, self._OPERATION_NAME_PATH)
            identifier = self._find_text(resource, self._IDENTIFIER_PATH)
            layer_name = self._find_text(resource, self._SCOPED_NAME_PATH)
            if identifier is None or layer_name is None:
                continue

            self.logger.debug("\tcoupledRessources:")
            self.logger.debug("\tOperation : %s", operation_name)
            self.logger.debug("\tidentifier : %s", identifier)
            self.logger.debug("\tLayer Name: %s", layer_name)
            self.logger.debug("")

            try:
                if service_type.lower() == "wms":
                    self.owsServices.checkWmsLayer(url, layer_name)
                elif service_type.lower() == "wfs":
                    self.owsServices.checkWfsLayer(url, layer_name)
                else:
                    raise Inconsistency("Invalid service type : %s" % service_type)
            except Inconsistency as e:
                # Tag the error with the service metadata it was found in and
                # let it propagate with its original traceback.
                e.md_uuid = mds.identifier
                raise