Source code for minsci.xmu.tools.matcher

"""Tools to match EMu records for making attachments"""

import copy
import json
import os

from unidecode import unidecode

from ...xmu import XMu, MinSciRecord, is_reference


def standardize_taxon(species):
    """Standardize formatting of classification to improve matching

    The name is transliterated with unidecode and hyphens are replaced with
    spaces. If the result contains exactly one comma, the two parts are
    stripped and swapped (e.g., "b, a" becomes "a b").

    Args:
        species (str): the name to standardize

    Returns:
        The standardized name
    """
    cleaned = unidecode(species).replace('-', ' ')
    # Anything other than exactly one comma is returned without reordering
    if cleaned.count(',') != 1:
        return cleaned
    before, after = cleaned.split(',')
    return ' '.join((after.strip(), before.strip()))
# List of fields to include for the attachment search. All other fields will
# be excluded.
INCLUDE = {
    'ebibliography': ['ArtTitle', 'ArtParentRef', 'ArtVolume', 'ArtIssue'],
    'ecatalogue': ['CatPrefix', 'CatNumber', 'CatSuffix', 'CatDivision'],
    'elocations': ['LocLevel{}'.format(x) for x in range(1, 9)],
    'eparties': ['NamFirst', 'NamMiddle', 'NamLast', 'NamOrganisation'],
    'etaxonomy': ['ClaSpecies'],
}

# List of fields to exclude from the attachment search. All other fields will
# be included.
#
# NOTE: 'LocRecordClassification' and 'LocSiteStationNumber' are separate
# entries. They were previously written without a comma between them, which
# concatenated them into a single name that matched neither field. Match
# indexes cached before that fix were keyed with both fields included and
# presumably need to be rebuilt.
EXCLUDE = {
    'ecollectionevents': [
        'ColParticipantStringAuto',
        'ColParticipantString',
        'LatCentroidLatitude0',
        'LatCentroidLatitudeDec_tab',
        'LatCentroidLongitude0',
        'LatCentroidLongitudeDec_tab',
        'LatPreferred_tab',
        'LocRecordClassification',
        'LocSiteStationNumber',
        'LocSiteStationSource',
        'LocSiteName_tab',
        'MulMultiMediaRef_tab',
        'LocContinent',
        'VolRegionName',
        'VolSubRegionName',
    ],
}

# Contains lists of fields that contain the same data converted to different
# units. This causes matching problems, and only one of these should be
# included in the data passed back to EMu if no match can be made offline.
DERIVED = {
    'ecollectionevents': [
        ['AquBottomDepthFromFath',
         'AquBottomDepthFromFt',
         'AquBottomDepthFromMet'],
        ['AquBottomDepthToFath',
         'AquBottomDepthToFt',
         'AquBottomDepthToMet'],
        ['LatLatitude_nesttab',
         'LatLatitudeDecimal_nesttab',
         'LatLatitudeVerbatim_nesttab'],
        ['LatLongitude_nesttab',
         'LatLongitudeDecimal_nesttab',
         'LatLongitudeVerbatim_nesttab'],
        ['TerElevationFromFt', 'TerElevationFromMet'],
        ['TerElevationToFt', 'TerElevationToMet'],
    ],
}

# Transform data in certain fields to handle different formats, etc. Maps
# module name to a dict of {field name: callable applied to the field value}.
TRANSFORMATIONS = {
    'etaxonomy': {'ClaSpecies': standardize_taxon},
}
class Matcher(XMu):
    """Match data from a given record to existing EMu records

    Attributes:
        include (list): fields used to perform the match. If empty, every
            field not listed in exclude is considered.
        exclude (list): fields that are never used to perform the match
        transformations (dict): maps field names to callables used to
            normalize values before the match key is computed
        derived (list): groups of fields that hold the same data in
            different units; at most one field per group is kept when
            preparing a new record
        new (list): records that do not exist in EMu
        keep (list): set to ['_records', '_fields']; presumably the names
            of the attributes persisted by save() and load() (confirm
            against XMu)
        write (bool): set to False here; not otherwise used in this class
        module (str): the name of the module (read in this class but
            assigned elsewhere, presumably by XMu)
        _records (dict): maps each match key to a list of records that
            contain only the irn of the matching EMu record
        _fields (list): sorted names of the fields found while building
            the match index
    """

    def __init__(self, module, include=None, exclude=None):
        """Load the cached match index for a module or build a new one

        Args:
            module (str): the name of the EMu module to match against
            include (list): fields to use for matching. If None, defaults
                to the entry for this module in INCLUDE.
            exclude (list): fields to leave out of matching. If None,
                defaults to the entry for this module in EXCLUDE.
        """
        # Single-argument print calls behave the same in Python 2 and 3
        print('Creating attachment search for {}...'.format(module))
        fp = os.path.join('matcher', '{}'.format(module))
        super(Matcher, self).__init__(fp, module=module,
                                      container=MinSciRecord)
        self.keep = ['_records', '_fields']
        self.include = INCLUDE.get(module, []) if include is None else include
        self.exclude = EXCLUDE.get(module, []) if exclude is None else exclude
        self.transformations = TRANSFORMATIONS.get(module, {})
        self.derived = DERIVED.get(module, [])
        self.new = []
        self.write = False
        json_path = os.path.join('matcher', '{}.json'.format(module))
        try:
            self.load(json_path)
        except IOError:
            # No cached index is available, so build one from the source
            # data and cache it for the next run
            self._records = {}
            self._fields = []
            self.fast_iter(report=10000)
            self._fields = list(set(self._fields))
            print(self._fields)
            self.save(json_path)
        self._fields.sort()

    def iterate(self, element):
        """Populate dict used for matching

        Args:
            element: the element to parse into a record using self.parse

        Returns:
            True if the record was skipped, otherwise None

        Raises:
            ValueError: if no key could be computed for the record
        """
        rec = self.parse(element)
        # HACK: Skip sites if given by collector
        if (self.module == 'ecollectionevents'
                and rec('LocSiteNumberSource') == 'Collector'):
            return True
        # IRN will never be included in the match set
        irn = rec.pop('irn')
        key = self.keyer(rec)
        # keyer strips rec down to the match fields in place, so only the
        # fields actually used in the key are recorded here
        self._fields.extend(rec.keys())
        if not key:
            raise ValueError(rec)
        data = self.container({'irn': irn})
        self._records.setdefault(key, []).append(data)

    def keyer(self, rec):
        """Format a value as a standard key to use for matching

        The record is modified in place: it is pruned, fields outside the
        match set are deleted, and any transformations defined for the
        remaining fields are applied.

        NOTE: json.dumps never returns an empty string (an empty dict is
        encoded as '{}'), so callers that test the key for truthiness will
        not detect records left without any match fields.

        Args:
            rec (XMuRecord): the record to match or match against

        Returns:
            A lower-cased, JSON-encoded string representing the desired
            fields from the source record
        """
        rec.prune()
        # Iterate over a snapshot of the keys because entries are deleted
        # from the record inside the loop
        for key in list(rec.keys()):
            not_included = bool(self.include) and key not in self.include
            if not_included or key in self.exclude:
                del rec[key]
                continue
            # Look the transformation up explicitly so that a KeyError
            # raised inside the transformation itself is not swallowed
            transform = self.transformations.get(key)
            if transform is not None:
                rec[key] = transform(rec[key])
        return json.dumps(rec, sort_keys=True).lower()

    def match(self, match_data, match_once=False):
        """Match record against the existing record set

        Args:
            match_data (dict): object data
            match_once (bool): if true, the record in the match dictionary
                will be deleted once it is matched

        Returns:
            A record containing only the irn if an irn was provided or a
            match was found. Otherwise a record prepared for import, which
            is also added to self.new if not already there. None if no key
            could be computed.
        """
        # No need to match if any IRN is provided or if there are no records
        # to match against, so check those parameters first.
        irn = match_data.get('irn')
        if irn is not None:
            return self.container({'irn': irn})
        if not self._records:
            return self._prepare(match_data)
        # Check for a match
        key = self.keyer(self.container(match_data).expand())
        if not key:
            return None
        try:
            matched = self._records[key][0]
        except (IndexError, KeyError):
            # No existing record matches, so track this one as new
            rec = self._prepare(match_data)
            try:
                del rec['irn']
            except KeyError:
                pass
            if rec not in self.new:
                self.new.append(rec)
            return rec
        if match_once:
            del self._records[key][0]
        return matched

    def attach(self, rec, fields, mapper):
        """Attach a record from another module to the provided record

        Args:
            rec (XMuRecord): an expanded XMu record
            fields (list): names of the fields in rec that hold the data
                to match
            mapper (Mapper): a Mapper object for the current record

        Returns:
            The result of match() for the data found in fields, or a list
            of results if that data is a list. None if rec contains none
            of the fields.

        Raises:
            Exception: if some but not all of the results are truthy
        """
        # Note: bad matches can be made if the record being matched includes
        # fields that are not accounted for in the Matcher object.
        root = not is_reference(mapper(fields[0])[0])
        match_data = {}
        for field in fields:
            try:
                match_data[field] = copy.deepcopy(rec[field])
                if not root:
                    del rec[field]
            except KeyError:
                pass
        if not match_data:
            return None
        mapper.expand(match_data)
        if root:
            attach_field = 'irn'
        else:
            # Equivalent to keys()[0] but also works where keys() is a view
            attach_field = next(iter(match_data))
            match_data = match_data[attach_field]
        if isinstance(match_data, list):
            attachment = [self.match(row, root) for row in match_data if row]
        else:
            attachment = self.match(match_data, root)
        if any(attachment) and not all(attachment):
            raise Exception('Row {}: Bad attachment on'
                            ' {}'.format(rec['_RowNumber'], attach_field))
        return attachment

    def _prepare(self, rec):
        """Fill out an EMu record to include all fields used to match

        Args:
            rec (xmu.DeepDict): object data. Modified in place before it
                is copied into the container.

        Returns:
            The expanded record, flagged as active
        """
        if self.module == 'etaxonomy':
            rec.setdefault('ClaCurrentlyAccepted', 'Unknown')
        # Explicitly include the fields that should be empty. Fields that
        # we don't want to use as part of the match (as defined by the include
        # and exclude attributes) are also removed here.
        for key in self._fields:
            try:
                # Lookup retained from the original code. It has no effect
                # on a plain dict but may create the key on a DeepDict
                # (TODO confirm).
                rec[key]
            except KeyError:
                pass
            not_included = bool(self.include) and key not in self.include
            if not_included or key in self.exclude:
                # The field may not be present in this record
                try:
                    del rec[key]
                except KeyError:
                    pass
        # Expand into a full EMu record
        rec = self.container(rec).expand()
        # EMu does not automatically exclude inactive records, so we need to
        # specify that we only want active records in both the base record
        # and any references therein. Only goes one deep right now.
        rec['SecRecordStatus'] = 'Active'
        for field in [fld for fld in rec if is_reference(fld)]:
            if not rec[field]:
                continue
            try:
                if 'irn' not in rec[field]:
                    rec[field]['SecRecordStatus'] = 'Active'
            except TypeError:
                # Assigning by name fails with a TypeError if the reference
                # is a list of rows, so flag each row instead
                for row in rec[field]:
                    if 'irn' not in row:
                        row['SecRecordStatus'] = 'Active'
        # Look for derivative fields. EMu includes a handful of groups of
        # related fields that are derived from each other (coordinates,
        # elevation, and depth). Matching against these fields is a problem
        # because it's unlikely that the import will include more than one,
        # so we check for those here. The first populated field in each
        # group is kept and the rest of the group is removed.
        for sources in self.derived:
            for src in sources:
                if rec.get(src) is None:
                    continue
                for derived in [fld for fld in sources if fld != src]:
                    try:
                        del rec[derived]
                    except KeyError:
                        pass
                break
        return rec
def rower(rec, cols):
    """Group data from different fields into rows

    Args:
        rec (dict): maps column names to lists of values
        cols (list): names of the columns to combine. Columns missing from
            rec are skipped.

    Returns:
        List of dicts, where the dict at position i maps each column name
        to the value at position i in that column. Columns with fewer
        values than i are left out of that dict.
    """
    grouped = []
    for name in cols:
        values = rec.get(name, [])
        for index, value in enumerate(values):
            if index < len(grouped):
                grouped[index][name] = value
            else:
                # This column is longer than any seen so far
                grouped.append({name: value})
    return grouped