"""Tools to match EMu records for making attachments"""
import copy
import json
import os
from unidecode import unidecode
from ...xmu import XMu, MinSciRecord, is_reference
def standardize_taxon(species):
    """Normalize the formatting of a taxon name to improve matching

    Args:
        species (str): the name to normalize

    Returns:
        The name passed through unidecode with hyphens replaced by spaces.
        If the result contains exactly one comma, the two parts are
        stripped and swapped (so 'b, a' becomes 'a b').
    """
    normalized = unidecode(species).replace('-', ' ')
    # Names with zero or several commas are returned without reordering
    if normalized.count(',') != 1:
        return normalized
    head, tail = normalized.split(',')
    return ' '.join((tail.strip(), head.strip()))
# Fields to include in the attachment search, keyed by module name. When a
# module is listed here, all of its other fields are excluded from the match.
INCLUDE = {
    'ebibliography': ['ArtTitle', 'ArtParentRef', 'ArtVolume', 'ArtIssue'],
    'ecatalogue': ['CatPrefix', 'CatNumber', 'CatSuffix', 'CatDivision'],
    # LocLevel1 through LocLevel8. range (not xrange) is used so that this
    # expression evaluates on both Python 2 and Python 3.
    'elocations': ['LocLevel{}'.format(x) for x in range(1, 9)],
    'eparties': ['NamFirst', 'NamMiddle', 'NamLast', 'NamOrganisation'],
    'etaxonomy': ['ClaSpecies'],
}
# Fields to exclude from the attachment search, keyed by module name. All
# other fields in the module are included in the match.
EXCLUDE = {
    'ecollectionevents': [
        'ColParticipantStringAuto',
        'ColParticipantString',
        'LatCentroidLatitude0',
        'LatCentroidLatitudeDec_tab',
        'LatCentroidLongitude0',
        'LatCentroidLongitudeDec_tab',
        'LatPreferred_tab',
        # Each name needs its own trailing comma: adjacent string literals
        # are silently concatenated into a single (nonexistent) field name.
        'LocRecordClassification',
        'LocSiteStationNumber',
        'LocSiteStationSource',
        'LocSiteName_tab',
        'MulMultiMediaRef_tab',
        'LocContinent',
        'VolRegionName',
        'VolSubRegionName',
    ],
}
# Groups of fields, keyed by module name, that hold the same data converted
# to different units. Matching on more than one member of a group causes
# problems, so only one field from each group should be included in the data
# passed back to EMu if no match can be made offline.
DERIVED = {
    'ecollectionevents': [
        # AquBottomDepthFrom*
        [
            'AquBottomDepthFromFath',
            'AquBottomDepthFromFt',
            'AquBottomDepthFromMet',
        ],
        # AquBottomDepthTo*
        [
            'AquBottomDepthToFath',
            'AquBottomDepthToFt',
            'AquBottomDepthToMet',
        ],
        # LatLatitude*
        [
            'LatLatitude_nesttab',
            'LatLatitudeDecimal_nesttab',
            'LatLatitudeVerbatim_nesttab',
        ],
        # LatLongitude*
        [
            'LatLongitude_nesttab',
            'LatLongitudeDecimal_nesttab',
            'LatLongitudeVerbatim_nesttab',
        ],
        # TerElevationFrom*
        [
            'TerElevationFromFt',
            'TerElevationFromMet',
        ],
        # TerElevationTo*
        [
            'TerElevationToFt',
            'TerElevationToMet',
        ],
    ],
}
# Callables, keyed by module name and then by field name, used to transform
# the data in certain fields (to handle different formats, etc.) before
# matching.
TRANSFORMATIONS = {
    'etaxonomy': {
        'ClaSpecies': standardize_taxon,
    },
}
class Matcher(XMu):
    """Match data from a given record to existing EMu records

    Attributes:
        fields (list): the subset of EMu fields used to perform the match.
            If fields is None, all fields in the source will be considered.
        from_json (bool): specifies whether fields lookup was read from a
            pre-existing JSON file
        module (str): the name of the module
        include (list): fields used for the match. If empty, every field
            not listed in exclude is used.
        exclude (list): fields left out of the match
        transformations (dict): maps field names to callables applied to
            the value of that field when a match key is built
        derived (list): groups of fields of which only one is kept when
            a record is prepared
        new (list): prepared records for which no match was found in the
            lookup
    """

    def __init__(self, module, include=None, exclude=None):
        """Load or build the lookup used to match records from a module

        Args:
            module (str): the name of the module
            include (list): fields to match on. Defaults to the entry for
                this module in INCLUDE.
            exclude (list): fields to ignore. Defaults to the entry for
                this module in EXCLUDE.
        """
        # Single-argument print() behaves the same on Python 2 and 3
        print('Creating attachment search for {}...'.format(module))
        fp = os.path.join('matcher', '{}'.format(module))
        super(Matcher, self).__init__(fp, module=module,
                                      container=MinSciRecord)
        self.keep = ['_records', '_fields']
        self.include = INCLUDE.get(module, []) if include is None else include
        self.exclude = EXCLUDE.get(module, []) if exclude is None else exclude
        self.transformations = TRANSFORMATIONS.get(module, {})
        # Values in DERIVED are lists of field groups, so default to a list
        self.derived = DERIVED.get(module, [])
        self.new = []
        self.write = False
        json_path = os.path.join('matcher', '{}.json'.format(module))
        try:
            self.load(json_path)
        except IOError:
            # The saved lookup could not be read, so rebuild it from the
            # source data and save it for next time
            self._records = {}
            self._fields = []
            self.fast_iter(report=10000)
            self._fields = list(set(self._fields))
            print(self._fields)
            self.save(json_path)
        self._fields.sort()

    def iterate(self, element):
        """Populate dict used for matching

        Args:
            element: the element passed to self.parse to get the record

        Returns:
            True if the record was skipped, otherwise None

        Raises:
            ValueError: if no key could be created for the record
        """
        rec = self.parse(element)
        # HACK: Skip sites if given by collector
        if (self.module == 'ecollectionevents'
                and rec('LocSiteNumberSource') == 'Collector'):
            return True
        irn = rec.pop('irn')  # IRN will never be included in the match set
        key = self.keyer(rec)
        # keyer removes unwanted fields from rec in place, so only fields
        # that are part of the match set are recorded here
        self._fields.extend(rec.keys())
        if not key:
            raise ValueError(rec)
        data = self.container({'irn': irn})
        self._records.setdefault(key, []).append(data)

    def keyer(self, rec):
        """Format a value as a standard key to use for matching

        The record is modified in place: it is pruned, fields that are not
        part of the match set are deleted, and transformations are applied.

        Args:
            rec (XMuRecord): the record to match or match against

        Returns:
            A lower-cased, JSON-encoded string representing the desired
            fields from the source record
        """
        rec.prune()
        # Copy the keys because the record is modified inside the loop
        for key in list(rec.keys()):
            if ((self.include and key not in self.include)
                    or key in self.exclude):
                del rec[key]
            elif key in self.transformations:
                # Test for membership explicitly instead of catching
                # KeyError so errors raised by the transform are not hidden
                rec[key] = self.transformations[key](rec[key])
        # NOTE(review): json.dumps never returns an empty string (an empty
        # record gives '{}'), so callers testing the key for truthiness will
        # always see a truthy value -- confirm that is intended.
        return json.dumps(rec, sort_keys=True).lower()

    def match(self, match_data, match_once=False):
        """Match record against the existing record set

        Args:
            match_data (dict): object data
            match_once (bool): if true, the record in the match dictionary
                will be deleted once it is matched

        Returns:
            A record containing only the irn if match_data already has one,
            the matching entry from the lookup if a match is found, or the
            prepared record (without irn) if no match can be made
        """
        # No need to match if any IRN is provided or if there are no records
        # to match against, so check those parameters first.
        irn = match_data.get('irn')
        if irn is not None:
            return self.container({'irn': irn})
        if not self._records:
            return self._prepare(match_data)
        # Check for a match
        key = self.keyer(self.container(match_data).expand())
        if not key:
            return None
        try:
            matched = self._records[key][0]
        except (IndexError, KeyError):
            # No match, so return the data needed to find or create the
            # record in EMu and remember it in self.new
            rec = self._prepare(match_data)
            try:
                del rec['irn']
            except KeyError:
                pass
            if rec not in self.new:
                self.new.append(rec)
            return rec
        if match_once:
            del self._records[key][0]
        return matched

    def attach(self, rec, fields, mapper):
        """Attach a record from another module to the provided record

        Args:
            rec (XMuRecord): an expanded XMu record
            fields (list): the fields in rec that hold the data to match
            mapper (Mapper): a Mapper object for the current record

        Returns:
            The result of match for the data found in rec, or a list of
            results if that data is a list. None if rec contains none of
            the given fields.

        Raises:
            Exception: if some but not all parts of the attachment are
                empty
        """
        # Note: bad matches can be made if the record being matched includes
        # fields that are not accounted for in the Matcher object.
        root = not is_reference(mapper(fields[0])[0])
        match_data = {}
        for field in fields:
            try:
                match_data[field] = copy.deepcopy(rec[field])
                if not root:
                    del rec[field]
            except KeyError:
                pass
        if match_data:
            mapper.expand(match_data)
            if root:
                attach_field = 'irn'
            else:
                # Works on both Python 2 and 3, unlike keys()[0]
                attach_field = next(iter(match_data))
                match_data = match_data[attach_field]
            if isinstance(match_data, list):
                attachment = [self.match(row, root)
                              for row in match_data if row]
            else:
                attachment = self.match(match_data, root)
            if any(attachment) and not all(attachment):
                raise Exception('Row {}: Bad attachment on'
                                ' {}'.format(rec['_RowNumber'], attach_field))
            return attachment

    def _prepare(self, rec):
        """Fill out an EMu record to include all fields used to match

        Args:
            rec (xmu.DeepDict): object data. Modified in place before it
                is expanded.

        Returns:
            The expanded record
        """
        if self.module == 'etaxonomy':
            rec.setdefault('ClaCurrentlyAccepted', 'Unknown')
        # Remove known fields that we don't want to use as part of the
        # match (as defined by the include and exclude attributes). Fields
        # missing from the record are not filled in, so use pop with a
        # default to avoid a KeyError when the field is absent.
        for key in self._fields:
            if ((self.include and key not in self.include)
                    or key in self.exclude):
                rec.pop(key, None)
        # Expand into a full EMu record
        rec = self.container(rec).expand()
        # EMu does not automatically exclude inactive records, so we need to
        # specify that we only want active records in both the base record
        # and any references therein. Only goes one deep right now.
        rec['SecRecordStatus'] = 'Active'
        for ref in [fld for fld in rec if is_reference(fld)]:
            if rec[ref]:
                try:
                    if 'irn' not in rec[ref]:
                        rec[ref]['SecRecordStatus'] = 'Active'
                except TypeError:
                    # A TypeError from the assignment above is handled by
                    # treating the value as a sequence of rows
                    for row in rec[ref]:
                        if 'irn' not in row:
                            row['SecRecordStatus'] = 'Active'
        # Look for derivative fields. EMu includes a handful of groups of
        # related fields that are derived from each other (coordinates,
        # elevation, and depth). Matching against these fields is a problem
        # because it's unlikely that the import will include more than one,
        # so we check for those here.
        for sources in self.derived:
            for src in sources:
                if rec.get(src) is not None:
                    # Keep the first populated field in the group and drop
                    # the others
                    for derived in sources:
                        if derived != src:
                            try:
                                del rec[derived]
                            except KeyError:
                                pass
                    break
        return rec
def rower(rec, cols):
    """Group data from different fields into rows

    Args:
        rec (dict): maps column names to lists of values
        cols (list): the names of the columns to combine, in order

    Returns:
        List of dicts in which the dict at index i maps each column to the
        i-th value of that column. Columns that are missing from rec, or
        whose value is None or empty, contribute nothing.
    """
    rows = []
    for col in cols:
        # `or []` also covers columns that are present but set to None
        for i, val in enumerate(rec.get(col) or []):
            if i < len(rows):
                rows[i][col] = val
            else:
                rows.append({col: val})
    return rows