Converting FGDC XML metadata records for WMS and WFS services to ISO 19139 via Python and GeoNetwork

This post describes a series of Python scripts to convert FGDC XML metadata for WMS and WFS services to ISO 19139 service metadata via GeoNetwork's OGC harvest service. Here are the steps:

  1. Extract WMS and WFS GetCapabilities URLs from FGDC XML metadata and write them to a file (the FGDC elements this step targets are sketched below the list).
  2. Create a GeoNetwork Harvest Node from the extracted GetCapabilities URLs and let GeoNetwork's OGC WMS/WFS harvester create ISO 19139 metadata from the GetCapabilities response. Note: the resulting ISO 19139 metadata is only as good as the information in the GetCapabilities response. In addition, GetCapabilities responses do not include all fields required for a minimal ISO 19139 record.
  3. Copy some FGDC metadata entries (Title, Abstract, etc.) to newly created ISO 19139 metadata.
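
To make step 1 concrete: FGDC records publish service endpoints as Network Resource (networkr) elements nested inside Digital Form (digform) elements of the Distribution Information section. Here is a minimal sketch of that lookup; the inline FGDC fragment is hypothetical and heavily trimmed:

from xml.dom import minidom

# hypothetical, trimmed FGDC fragment - real records carry far more structure
fgdc_fragment = """<metadata><distinfo><stdorder><digform><digtopt><onlinopt>
<computer><networka><networkr>http://example.com/ogc?request=GetCapabilities&amp;service=WMS&amp;version=1.1.1&amp;</networkr>
</networka></computer></onlinopt></digtopt></digform></stdorder></distinfo></metadata>"""

xmldoc = minidom.parseString(fgdc_fragment)
for node in xmldoc.getElementsByTagName('networkr'): # FGDC Network Resource
    url = node.firstChild.nodeValue
    if 'service=wms' in url.lower() or 'service=wfs' in url.lower():
        print url # a WMS or WFS GetCapabilities URL worth collecting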

Our goal is to have working WMS and WFS metadata records in our CSW catalog that can be used to add the described services to analytical software such as ESRI's CSW Client for ArcGIS Desktop. Following are the reasons why I am currently choosing this convoluted process over a direct FGDC to ISO 19139 metadata conversion:

  1. Harvesting the metadata from GetCapabilities responses ensures that the service actually exists and that the resulting ISO 19139 service metadata records conform to some convention - in this case GeoNetwork's.
  2. The source FGDC XML metadata I am working with is sometimes not well-formed and almost never schema-conformant. More importantly, there seems to be no standardized vocabulary for describing OGC services in FGDC metadata.
  3. I have not seen any FGDC service metadata to ISO 19139 conversion efforts, and my own experience converting FGDC XML dataset metadata records to ISO 19139 was not very satisfying.

Extract GetCapabilities URLs from FGDC XML metadata

#!/usr/bin/env python2.6
# this is gn_hn_ogc_create_list.py

"""
Extract WMS & WFS GetCapabilities URLs from
FGDC metadata records for OGC services in perparation
of GeoNetwork OGC Harvest Node creation.

Creates a space-delimited file with following information:
<WMS or WFS><WxS version><space><WMS or WFS GetCapabilities URL>

Works with WMS and WFS GetCapabilities URLS that include a
"version=" query string attribute.
"""

# Library Imports
#import this
import os
import cProfile
from xml.dom import minidom
from time import strftime

# module globals and constants
__author__ = 'Wolfgang Grunberg'
__credits__ = ['Arizona Geological Survey', 'Wolfgang Grunberg', 'the Internets']
__email__ = 'wgrunberg@azgs.az.gov'
__status__ = 'Prototype' # "Prototype", "Development", or "Production"
__version__ = '0.2'
__date__ = '2010/03/10'

# constants
FGDC_XML_DIR_PATH = "C:\\temp\\fgdc_source\\"
OGC_URL_OUT_FILE = "C:\\temp\\OGC_GetCapabilities_URLs.txt"


def uniqify_list(seq, idfun=None):
    """
    Fast way to uniqify a list
    By Peter Bengtsson
    http://www.peterbe.com/plog/uniqifiers-benchmark
    """
    # order preserving
    if idfun is None:
        def idfun(x): return x
    seen = {}
    result = []
    for item in seq:
        marker = idfun(item)
        # in old Python versions:
        # if seen.has_key(marker)
        # but in new ones:
        if marker in seen: continue
        seen[marker] = 1
        result.append(item)
    return result


def contains(string, sub_string):
    """
    Checks if a sub-string is found in another string
    """
    # the built-in "in" operator does this directly
    return sub_string in string


def extract_url_list(dir_path=""):
    """
    Extract OGC GetCapabilities URLs from FGDC XML metadata
    """
    print "***** START Extract OGC GetCapabilities URL "+strftime("%Y-%m-%d %H:%M:%S")+" *****"

    # Variables
    url_list = []

    # get file names from FGDC directory
    try:
        directory = os.listdir(dir_path)
        #print directory # debug
    except os.error:
        print " EXCEPTION: FGDC XML metadata directory does not exist ("+dir_path+")"
        return url_list

    for file_name in directory:
        #print file_name # debug
        source_file_path = dir_path+file_name

        with open(source_file_path, 'r') as fgdc_file:
            fgdc_metadata = fgdc_file.read()
            #print ' Reading metadata file: '+file_name
            #print fgdc_metadata # debug

        try:
            # Load FGDC metadata string into XML object
            xmldoc = minidom.parseString(fgdc_metadata)
            #print xmldoc.toxml() # debug

            for node_digform in xmldoc.getElementsByTagName('digform'): # FGDC digital form of a standard order
                #print node_digform.firstChild.nodeValue # debug
                for node_networkr in node_digform.getElementsByTagName('networkr'): # FGDC Network Resource
                    #print node_networkr.firstChild.nodeValue # debug
                    url = node_networkr.firstChild.nodeValue
                    # extract WMS and WFS URLs
                    if 'service=wms' in url.lower():
                        #print url # debug
                        url_list.append(url)
                    elif 'service=wfs' in url.lower():
                        #print url # debug
                        url_list.append(url)
            xmldoc.unlink() # cleanup DOM for improved performance
        except Exception:
            # skip files that are not FGDC XML service metadata and keep scanning
            print ' EXCEPTION: Not an FGDC XML service metadata file: '+file_name
            continue

    # Sort URL list while ignoring case
    url_list.sort(key=lambda x: x.lower())

    # Uniqify the URL list while respecting case
    url_list = uniqify_list(url_list)

    print "***** END Extract OGC GetCapabilities URL "+strftime("%Y-%m-%d %H:%M:%S")+" *****"
    return url_list


def create_url_file(url_list=None, file_path=''):
    """
    Create OGC URL file from list
    """
    print "***** START Create OGC URL file from list "+strftime("%Y-%m-%d %H:%M:%S")+" *****"

    # "Do not use mutable objects as default values in the function or method definition."
    if url_list is None:
        url_list = []

    # Variables
    file_content = ''

    # Construct space-delimited string of GeoNetwork OGC type/version code and GetCapabilities URL
    for url in url_list:
        #print url # debug
        # Extract WMS/WFS service and version from the query string
        service = url.split('service=')[1][:3]
        version = url.split('version=')[1][:5]

        file_content += service.upper()+version+' '+url+'\n'
        #print file_content # debug

    # create URL file
    try:
        url_file = open(file_path, 'w')
        url_file.write(file_content)
        url_file.close()
    except IOError:
        print " EXCEPTION: failed to write "+file_path
        return False

    print "***** END Create OGC URL file from list "+strftime("%Y-%m-%d %H:%M:%S")+" *****"
    return True


def et():
    """
    Extract and transform OGC URLs from FGDC metadata files
    """

    url_list = extract_url_list(FGDC_XML_DIR_PATH)
    #print url_list # debug

    create_url_file(url_list, OGC_URL_OUT_FILE)


if __name__ == "__main__":
    et()
    #cProfile.run('et()') # execution performance information


Create and execute GeoNetwork Harvest Node for WMS/WFS services

This script extracts OGC GetCapabilities URLs from a file, transforms them into GeoNetwork's Create OGC Harvest Node requests, executes the requests, and runs the GeoNetwork GetCapabilities to ISO 19139 metadata harvest. The script reads a file (OGC_GetCapabilities_URLs.txt) with an OGC service type/version key and a GetCapabilities URL value, such as:

WMS1.1.1 http://mrdata.usgs.gov/ArcGIS/services/Geophysics/NorthAmericaCompilation_static/MapServer/WMSServer?request=getcapabilities&service=WMS&version=1.1.1&
WFS1.0.0 http://mrdata.usgs.gov/cgi-bin/mapserv?map=active-mines.map&request=getcapabilities&service=WFS&version=1.0.0&
WMS1.1.1 http://mrdata.usgs.gov/cgi-bin/mapserv?map=active-mines.map&request=getcapabilities&service=WMS&version=1.1.1&
WFS1.0.0 http://mrdata.usgs.gov/cgi-bin/mapserv?map=akages-geo.map&request=getcapabilities&service=WFS&version=1.0.0&
WMS1.1.1 http://mrdata.usgs.gov/cgi-bin/mapserv?map=akages-geo.map&request=getcapabilities&service=WMS&version=1.1.1&

Read and modify the Python script before executing it. This script has been tested with GeoNetwork 2.4. See "Create a GeoNetwork OGC Harvest Node (WMS or WFS GetCapabilities to ISO 19139 metadata) through xml.harvesting.add request" for an introduction to generating GeoNetwork Harvest Node requests.
Updated: Version 0.4 now saves the freshly created Harvest Nodes' IDs to a file (GeoNetwork_HarvestNode_IDs.txt).
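
The ID file is just the JSON-encoded list that encode_json_file() writes, so its content looks something like this (these IDs are made up):

["125", "126", "127"]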

#!/usr/bin/env python2.6
# this is gn_hn_create_OGC_harvestnode.py

"""
Create GeoNetwork Harvest Node for OGC WMS and WFS services.

Requires a space-delimited file with following information:
<WMS or WFS><WxS version><space><WMS or WFS GetCapabilities URL>
"""

# Library Imports
#import this
import urllib
import urllib2
import cProfile
import cookielib
import json
import re # Regular Expression
from xml.dom import minidom
from time import strftime
from xml.sax.saxutils import escape # used to convert the <, &, and > characters to the corresponding entity references

# module globals and constants
__author__ = 'Wolfgang Grunberg'
__credits__ = ['Arizona Geological Survey', 'Wolfgang Grunberg', 'the Internets']
__email__ = 'wgrunberg@azgs.az.gov'
__status__ = 'Prototype' # "Prototype", "Development", or "Production"
__version__ = '0.4'
__date__ = '2010/03/10'

# Constants
OGC_URL_IN_FILE = "C:\\temp\\OGC_GetCapabilities_URLs.txt"
HN_ID_FILE = "C:\\temp\\GeoNetwork_HarvestNode_IDs.txt"
# GeoNetwork constants
GN_USERNAME = "admin"
GN_PASSWORD = "admin"
GN_BASE_URL = "http://localhost:8080"
GN_SERVLET_PATH = "/geonetwork/srv/en/"



def open_ogc_url_list(file_path=""):
    """
    Open space-delimited file with GeoNetwork Harvest Node type
    key (<WMS or WFS><WxS version>) and GetCapabilities URL value
    (<GetCapabilities URL>) pairs.

    Args:
        file_path: system path to the input file

    Returns:
        An array with key-value pairs read from the file
    """
    print "***** START Import OGC URLs from file "+strftime("%Y-%m-%d %H:%M:%S")+" *****"
    url_arr = []
    try:
        with open(file_path, 'r') as infile:
            for line in infile:
                try:
                    # simple check that the line actually holds a key-value pair ...
                    pair = line.split()
                    test = ' ' + pair[0] + ' - ' + pair[1]
                    # add <WMS or WFS><WxS version> and <GetCapabilities URL>
                    url_arr.append(pair)
                except IndexError:
                    print ' ERROR: Can\'t split or read name & value pair: ', line
                    return url_arr
    except IOError, e:
        print ' ERROR: can\'t find file or read data: ', e
        return url_arr

    print "***** END Import OGC URLs from file "+strftime("%Y-%m-%d %H:%M:%S")+" *****"
    return url_arr


def url_to_hn_request(ogc_type="", url=""):
    """
    Transform GetCapabilities URL to GeoNetwork's xml.harvesting.add request.

    Args:
        ogc_type: GeoNetwork (GN) harvest node type in the following format: W[MF]S[0-9].[0-9].[0-9]
        url: OGC WMS or WFS GetCapabilities URL

    Returns:
        GN Harvest Node xml.harvesting.add request XML
    """

    # Check for URL validity - search anywhere for service=WMS or service=WFS
    # (case-insensitive, since some services use service=wms)
    p_url = re.compile(r".*service=W[MF]S", re.IGNORECASE)
    m_url = p_url.match(url) # match the search
    if not m_url:
        print " ERROR: URL does not match .*service=W[MF]S "+url
        return ""

    # Check for Harvest Node Type validity - starts with WMS#.#.# or WFS#.#.#
    p_type = re.compile(r"^W[MF]S[0-9]\.[0-9]\.[0-9]")
    m_type = p_type.match(ogc_type) # match the search
    if not m_type:
        print " ERROR: Harvest Node type does not match ^W[MF]S[0-9].[0-9].[0-9] "+ogc_type
        return ""

    # Optional - Change the GeoNetwork metadata record icon.
    # The icon must be present in GeoNetwork's images/harvesting folder.
    #gn_metadata_icon = "default.gif"
    gn_metadata_icon = "usgs.gif"

    # construct the request XML - must escape &, <, and > characters in XML element value strings
    ogcwxs_xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
    ogcwxs_xml += "\
    <node type=\"ogcwxs\">\
        <site>\
            <name>"+escape(url)+"</name>\
            <account>\
                <use>false</use>\
                <username/>\
                <password/>\
            </account>\
            <url>"+escape(url)+"</url>\
            <ogctype>"+ogc_type+"</ogctype>\
            <icon>"+gn_metadata_icon+"</icon>\
        </site>\
        <options>\
            <!-- run every x minutes -->\
            <every>10080</every>\
            <!-- only run once -->\
            <oneRunOnly>true</oneRunOnly>\
            <lang>eng</lang>\
            <topic>geoscientificInformation</topic>\
            <createThumbnails>false</createThumbnails>\
            <useLayer>false</useLayer>\
            <useLayerMd>false</useLayerMd>\
            <datasetCategory>1</datasetCategory>\
        </options>\
        <!-- Give \"All\" read access -->\
        <privileges>\
            <group id=\"1\">\
                <operation name=\"view\"/>\
                <operation name=\"dynamic\"/>\
            </group>\
        </privileges>\
        <categories>\
            <category id=\"3\"/>\
        </categories>\
    </node>"
    #print ogcwxs_xml # debug

    return ogcwxs_xml


def encode_json_file(obj, file_path=""):
    """
    Encode a (list) object to JSON and write it to a file

    Args:
        obj: a list object
        file_path: path to the file that will be created

    Returns:
        True on success
    """

    # Variables
    encoded = ""

    # Encode the object
    if obj:
        encoded = json.dumps(obj)

    if encoded != "":
        # Write JSON string to file
        try:
            json_file = open(file_path, 'w')
            json_file.write(encoded)
            json_file.close()
        except IOError:
            print " EXCEPTION: failed to write "+file_path
            return False
        return True
    else:
        print " WARNING: Nothing to encode"
        return False


def decode_json_file(file_path=""):
    """
    Read a JSON file and decode it into a (list) object

    Args:
        file_path: path to the file that will be read

    Returns:
        List object
    """

    # Variables
    obj = []
    encoded = ""

    if file_path != "":
        # read file
        try:
            with open(file_path, 'r') as json_file:
                encoded = json_file.read()
                #print encoded # debug

            if encoded != "":
                # Create list object from JSON string
                obj = json.loads(encoded)
            else:
                print " WARNING: File is empty: " + file_path
        except (IOError, ValueError):
            print " ERROR: Cannot open or decode file: " + file_path
            return obj
    else:
        print " ERROR: No file path specified"

    return obj


def add_hn(ogcwxs_xml_list=None, gn_url="", username="", password=""):
    """
    Check that a Harvest Node with the same URL does not already exist,
    submit xml.harvesting.add, and execute the harvest.
    Includes logging in to GeoNetwork.

    Args:
        ogcwxs_xml_list: [<xml.harvesting.add XML>]
        gn_url: <URL to GN xml.harvesting.add service>
        username: <GN user name>
        password: <GN password>

    Returns:
        List of new GeoNetwork Harvest Node IDs
    """
    print "***** START Add Harvest Node "+strftime("%Y-%m-%d %H:%M:%S")+" *****"

    # "Do not use mutable objects as default values in the function or method definition."
    if ogcwxs_xml_list is None:
        ogcwxs_xml_list = []

    # Variables
    url_out = gn_url + "xml.user.logout"
    url_in = gn_url + "xml.user.login"
    url_harvest_get = gn_url + "xml.harvesting.get"
    url_harvest_add = gn_url + "xml.harvesting.add"
    hn_existing_url_list = ["http://localhost"] # List of GetCapabilities URLs in existing Harvest Nodes
    hn_id_list = [] # List of newly added Harvest Node IDs

    # HTTP header for authentication
    header_urlencode = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain"}
    # HTTP header for XML request
    header_xml = {"Content-type": "application/xml", "Accept": "text/plain"}
    # authentication POST parameters
    post_parameters = urllib.urlencode({"username": username, "password": password})


    # first, always log out
    try:
        request = urllib2.Request(url_out)
        response = urllib2.urlopen(request)
    except urllib2.URLError, e:
        print ' EXCEPTION: Could not connect to GeoNetwork:'
        print e
        return hn_id_list

    # send authentication request
    request = urllib2.Request(url_in, post_parameters, header_urlencode)
    response = urllib2.urlopen(request)
    # a basic memory-only cookie jar instance
    cookies = cookielib.CookieJar()
    cookies.extract_cookies(response, request)
    cookie_handler = urllib2.HTTPCookieProcessor(cookies)
    # a redirect handler
    redirect_handler = urllib2.HTTPRedirectHandler()
    # save cookie and redirect handler for future HTTP POSTs
    opener = urllib2.build_opener(redirect_handler, cookie_handler)


    # Existing Harvest Node info request
    xml_harvesting_get = "<request/>"
    request = urllib2.Request(url_harvest_get, xml_harvesting_get, header_xml)
    response = opener.open(request)
    # Existing Harvest Node info response
    xml_response = response.read()
    #print xml_response # debug
    # Get list of existing URLs
    xmldoc = minidom.parseString(xml_response)
    for node in xmldoc.getElementsByTagName('url'):
        #print node.firstChild.nodeValue # debug
        hn_existing_url_list.append(node.firstChild.nodeValue)
    xmldoc.unlink() # cleanup DOM for improved performance

    # Loop through xml.harvesting.add XML requests
    for ogcwxs_xml in ogcwxs_xml_list:
        #print ogcwxs_xml # debug
        duplicate = False

        # Parse out GetCapabilities URL
        xmldoc = minidom.parseString(ogcwxs_xml)
        new_url = xmldoc.getElementsByTagName('url')[0].firstChild.nodeValue
        #print new_url # debug
        xmldoc.unlink() # cleanup DOM for improved performance

        # compare existing and new URLs
        for url_exists in hn_existing_url_list:
            if new_url == url_exists:
                duplicate = True
                print " WARNING: Harvest Node already exists for " + new_url

        if duplicate == False:
            # Submit new Harvest Node
            request = urllib2.Request(url_harvest_add, ogcwxs_xml, header_xml)
            response = opener.open(request)
            # Read response
            xml_response = response.read()
            #print xml_response # debug

            # Parse response
            xmldoc = minidom.parseString(xml_response)
            # Get element attribute value of <node id="xxx">
            hn_id = xmldoc.getElementsByTagName('node')[0].attributes["id"].value
            # Get element value of <uuid>xxx</uuid>
            hn_uuid = xmldoc.getElementsByTagName('uuid')[0].firstChild.nodeValue
            hn_id_list.append(hn_id)
            print " New Harvest Node "
            print " ID: "+ hn_id
            print " UUID: "+ hn_uuid
            xmldoc.unlink() # cleanup DOM for improved performance


    # Last, always log out
    request = urllib2.Request(url_out)
    response = opener.open(request)
    #print response.read() # debug

    print "***** END Add Harvest Node "+strftime("%Y-%m-%d %H:%M:%S")+" *****"
    return hn_id_list


def run_hn(hn_id_list=None, gn_url="", username="", password=""):
    """
    Just start the harvester now.

    Args:
        hn_id_list: [<GeoNetwork Harvest Node ID>]
        gn_url: <URL to GN xml.harvesting.run service>
        username: <GN user name>
        password: <GN password>

    Returns:
        List of Harvest Node IDs that were run, or False on connection failure
    """
    print "***** START Run Harvest Node "+strftime("%Y-%m-%d %H:%M:%S")+" *****"

    # "Do not use mutable objects as default values in the function or method definition."
    if hn_id_list is None:
        hn_id_list = []

    # Variables
    url_out = gn_url + "xml.user.logout"
    url_in = gn_url + "xml.user.login"
    url_harvest_run = gn_url + "xml.harvesting.run"
    xml_harvest_run = ""

    # HTTP header for authentication
    header_urlencode = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain"}
    # HTTP header for XML request
    header_xml = {"Content-type": "application/xml", "Accept": "text/plain"}
    # authentication POST parameters
    post_parameters = urllib.urlencode({"username": username, "password": password})


    # first, always log out
    try:
        request = urllib2.Request(url_out)
        response = urllib2.urlopen(request)
    except urllib2.URLError, e:
        print ' EXCEPTION: Could not connect to GeoNetwork:'
        print e
        return False

    # send authentication request
    request = urllib2.Request(url_in, post_parameters, header_urlencode)
    response = urllib2.urlopen(request)
    # a basic memory-only cookie jar instance
    cookies = cookielib.CookieJar()
    cookies.extract_cookies(response, request)
    cookie_handler = urllib2.HTTPCookieProcessor(cookies)
    # a redirect handler
    redirect_handler = urllib2.HTTPRedirectHandler()
    # save cookie and redirect handler for future HTTP POSTs
    opener = urllib2.build_opener(redirect_handler, cookie_handler)


    # Create request
    xml_harvest_run = "<request>"
    for hn_id in hn_id_list:
        xml_harvest_run += "<id>"+hn_id+"</id>"
    xml_harvest_run += "</request>"
    #print xml_harvest_run # debug

    # Request Harvest Node run
    request = urllib2.Request(url_harvest_run, xml_harvest_run, header_xml)
    response = opener.open(request)
    # Show Response
    xml_response = response.read()
    print xml_response # debug

    print "***** END Run Harvest Node "+strftime("%Y-%m-%d %H:%M:%S")+" *****"
    return hn_id_list



def etl():
    """
    Extract OGC GetCapabilities URLs from file, transform them into GeoNetwork
    Create OGC Harvest Node requests, execute the requests, and run GeoNetwork's
    GetCapabilities to ISO 19139 metadata harvest.
    """

    # Variables
    hn_xml_list = []

    # Get ogctype types and GetCapabilities URLs from a file
    url_list = open_ogc_url_list(OGC_URL_IN_FILE)
    #print url_list # debug

    # Generate list with xml.harvesting.add XML
    for url in url_list:
        # create xml.harvesting.add XML
        hn_xml = url_to_hn_request(url[0], url[1])
        #print hn_xml # debug
        if hn_xml:
            # skip empty requests from invalid URLs or types
            hn_xml_list.append(hn_xml)
    #print hn_xml_list[0] # debug

    # Create Harvest Node if it does not exist already - [<xml.harvesting.add XML>]
    hn_id_list = add_hn(hn_xml_list, GN_BASE_URL + GN_SERVLET_PATH, GN_USERNAME, GN_PASSWORD)

    # Write Harvest Node ID list to a JSON-encoded file if the list is not empty;
    # otherwise fall back to the Harvest Node IDs saved by a previous run
    if hn_id_list:
        encode_json_file(hn_id_list, HN_ID_FILE)
    else:
        print " Harvest Node ID list is empty, reading IDs from file "
        hn_id_list = decode_json_file(HN_ID_FILE)

    # Run harvest - [<Harvest Node ID>]
    run_hn(hn_id_list, GN_BASE_URL + GN_SERVLET_PATH, GN_USERNAME, GN_PASSWORD)


if __name__ == "__main__":
    etl()
    #cProfile.run('etl()') # execution performance information


Improve new ISO 19139 record with source FGDC metadata entries

To do.
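
Until then, here is a minimal sketch of the idea, assuming the FGDC record's <title> and <abstract> elements are the values worth carrying over (the file path and element handling are illustrative only):

#!/usr/bin/env python2.6
# sketch only - not a finished script

from xml.dom import minidom

def get_fgdc_text(xmldoc, tag):
    """Return the text of the first <tag> element, or None."""
    nodes = xmldoc.getElementsByTagName(tag)
    if nodes and nodes[0].firstChild:
        return nodes[0].firstChild.nodeValue
    return None

# hypothetical source record
with open("C:\\temp\\fgdc_source\\example.xml", 'r') as fgdc_file:
    xmldoc = minidom.parseString(fgdc_file.read())

title = get_fgdc_text(xmldoc, 'title') # FGDC Citation Title
abstract = get_fgdc_text(xmldoc, 'abstract') # FGDC Abstract
xmldoc.unlink()

# These values would then replace the GetCapabilities-derived gmd:title and
# gmd:abstract elements in the harvested ISO 19139 record, e.g. by fetching
# and re-submitting the record through GeoNetwork's XML metadata services.
print title
print abstract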