Source code for minsci.xmu.tools.describer

"""Tools to describe and link multimedia using data from ecatalogue"""

import re
from collections import namedtuple
from copy import deepcopy

from ...helpers import (add_article, lcfirst, oxford_comma,
                        plural, singular, ucfirst)


# Terms that occasionally show up in the cut field but name a complete object
# rather than a setting. Order matters: the list is walked from first entry to
# last.
OBJECTS = [
    'box',
    'bead',
    'bowl',
    'bottle',
    'cup',
    'pendant',
    'sphere',
    'urn',
    'vase',
    'carving',
]
# (singular, plural) forms of every entry in OBJECTS, in the same order
INFLECTED = [(singular(term), plural(term)) for term in OBJECTS]

# Find/replace pairs, each written as [text to find, replacement]. The first
# entries tidy whitespace and punctuation and expand the abbreviation "med";
# the rest add the hyphens needed for a term to read properly as an adjective.
# NOTE(review): this list is not used in the visible part of the module;
# presumably a caller applies the pairs in order -- confirm before reordering.
PAIRS = [
    ['  ', ' '],
    [',-', ', '],
    [' med ', ' medium '],
    ['-med-', '-medium-'],
    [' shaped', '-shaped'],
    ['off white', 'off-white'],
    ['play of color', 'play-of-color'],
    ['light medium', 'light-to-medium'],
    ['light to medium', 'light-to-medium'],
    ['light dark', 'light-to-dark'],
    ['light to dark', 'light-to-dark'],
    ['medium light', 'medium-to-light'],
    ['medium to light', 'medium-to-light'],
    ['medium dark', 'medium-to-dark'],
    ['medium to dark', 'medium-to-dark'],
    ['dark light', 'dark-to-light'],
    ['dark to light', 'dark-to-light'],
    # The last two entries used to repeat the medium/light pairs above, which
    # left dark/medium as the only shade range without a rule.
    ['dark medium', 'dark-to-medium'],
    ['dark to medium', 'dark-to-medium']
]

# Terms that are pluralized whenever they turn up as a cut or a setting
ALWAYS_PLURAL = ['bead']

# Container for the result of summarize(): the trimmed descriptor dict, the
# caption, the keyword list, and the one-line summary.
Description = namedtuple('Description', 'object caption keywords summary')


def summarize(rec):
    """Summarizes basic information about an object

    Args:
        rec: catalog record. Its ``module`` attribute is overwritten with
            'ecatalogue' before any data is read.

    Returns:
        Description namedtuple holding the trimmed descriptor dict (irn,
        catnum, status, xname, and url only), the caption, the keywords,
        and a one-line summary.
    """
    rec.module = 'ecatalogue'  # force module to ecatalogue
    descriptors = get_descriptors(rec)
    caption = get_caption(descriptors=descriptors)
    keywords = get_keywords(descriptors=descriptors)
    tags = get_tags(descriptors=descriptors)
    # Write summary line used to make a quick id of sample (for example, when
    # matching media to samples). The bracketed tag suffix is added only when
    # there are tags; formatting the list itself and stripping brackets from
    # the right would also eat the closing bracket of a populated tag list.
    summary = u'{}: {}'.format(descriptors['catnum'], caption)
    if tags:
        summary += u' [{}]'.format(u', '.join(tags))
    # Cull unneeded keys from descriptors. items() is used instead of
    # iteritems() so the function also runs on Python 3.
    keep = ('irn', 'catnum', 'status', 'xname', 'url')
    obj = {key: val for key, val in descriptors.items() if key in keep}
    obj['xname'] = ucfirst(obj['xname'])
    return Description(object=obj,
                       caption=caption,
                       keywords=keywords,
                       summary=summary)
def get_descriptors(rec):
    """Collects the descriptive fields of a record into a plain dict

    The dict carries the keys irn, catnum, name, xname, taxa, kind, cut,
    setting, colors, locality, country, state, weight, description, status,
    and url.
    """
    # Mineral name wins; the meteorite name is the fallback
    name = rec('MinName') or rec('MetMeteoriteName')

    # An identifier whose part before the first parenthesis is just "USNM"
    # is replaced by the name plus a (MET) marker
    catnum = rec.get_identifier(include_div=True, force_catnum=True)
    prefix = catnum.split('(')[0].strip()
    if prefix == 'USNM':
        catnum = name + ' (MET)'

    taxa = rec.get_classification()
    try:
        xname = rec.get_name(taxa=taxa, force_derived=True)
    except KeyError:
        xname = name

    # First word of the catalog name, minus any trailing "s"
    kind = rec('CatCatalog').split(' ')[0].rstrip('s')
    is_meteorite = kind == 'Meteorite'

    cut, setting = format_gems(rec)
    country, state, county = rec.get_political_geography()

    # A description that merely repeats the name adds nothing
    description = rec('BioLiveSpecimen').lower()
    description = description.rstrip('.').replace('"', "'")
    if description == name.lower():
        description = ''

    # Weight is only looked up for meteorites
    weight = rec.get_current_weight() if is_meteorite else ''

    descriptors = {
        'irn': rec('irn'),
        'catnum': catnum,
        'name': name,
        'xname': xname,
        'taxa': taxa,
        'kind': kind,
        'cut': cut,
        'setting': setting,
        'colors': format_colors(rec),
        'locality': format_locality(country, state, county),
        'country': country,
        'state': state,
        'weight': weight,
        'description': description,
        'status': rec('SecRecordStatus').lower(),
        'url': rec.get_url()
    }
    # Only meteorite names keep their leading capital
    if not is_meteorite:
        descriptors['xname'] = lcfirst(descriptors['xname'])
    return descriptors
def get_caption(rec=None, descriptors=None):
    """Derives a simple description of an object

    Descriptors are computed from rec when they are not supplied. A sentence
    about the catalog record is appended for any status other than active,
    and the result always ends with a period or a double quote.
    """
    if descriptors is None:
        descriptors = get_descriptors(rec)
    sentences = [format_caption(descriptors)]
    # Flag records that are not active
    status = descriptors['status']
    if status and status != 'active':
        if status == 'inactive':
            status = 'made inactive'
        sentences.append('The catalog record associated with this'
                         ' specimen has been {}.'.format(status))
    caption = '. '.join(part.rstrip('. ') for part in sentences)
    if caption.endswith(('.', '"')):
        return caption
    return caption + '.'
def get_keywords(rec=None, descriptors=None):
    """Builds the list of multimedia keywords for an object

    Keywords are drawn, in order, from kind, setting, taxa, and country; the
    state is added for records from the United States. Empty values and
    anything containing "unknow" are dropped, and each keyword is passed
    through ucfirst.
    """
    if descriptors is None:
        descriptors = get_descriptors(rec)
    candidates = [descriptors['kind'], descriptors['setting']]
    candidates.extend(descriptors['taxa'])
    country = descriptors['country']
    candidates.append(country)
    if country.lower() == 'united states':
        candidates.append(descriptors['state'])
    return [ucfirst(term) for term in candidates
            if term and 'unknow' not in term.lower()]
def get_tags(rec=None, descriptors=None):
    """Lists tags carrying special information useful for identifying objects

    Descriptors are computed from rec when they are not supplied.
    """
    if descriptors is None:
        descriptors = get_descriptors(rec)
    # NOTE: the rules that used to add PTS (polished thin sections) and the
    # location tags GGM and POD 4 are disabled, so no tags are produced.
    return []
def clean_caption(caption):
    """Cleans vestigial phrases from caption

    Collapses repeated spaces, trims leading and trailing periods, commas,
    and spaces, removes the fragments left behind when a caption mask is
    filled with empty values, and capitalizes the result with ucfirst.
    """
    # Collapse runs of spaces to one. Written as a pattern so the intent
    # survives whitespace normalization; a loop that replaces a single space
    # with a single space never terminates.
    caption = re.sub(r' {2,}', ' ', caption)
    # Applied in order; later rules rely on the output of earlier ones
    replacements = (
        ('from weighing', 'weighing'),
        ('colored from', 'from'),
        ('weighing .', '.'),
        ('Described as "."', ''),
        ('carved,', 'carved'),
        (' , ', ' '),
        (' . ', '. '),
        ('..', '.'),
        (',-', ', '),
    )
    caption = caption.strip('., ')
    for old, new in replacements:
        caption = caption.replace(old, new)
    return ucfirst(caption)
def format_caption(descriptors):
    """Formats caption based on the information in descriptors

    Works on a deep copy, so the descriptors passed in are not modified. A
    sentence mask is chosen from the cut, setting, colors, name, and
    locality that are available, wrapped with the name, weight, and
    description, filled in, and passed through clean_caption.
    """
    working = deepcopy(descriptors)
    # Make global changes to descriptors
    if working['catnum'].endswith('(MET)'):
        working['description'] = ''
    # Lowercase the name unless its first word is all caps or not purely
    # alphabetic
    xname = re.split('[ -]', working['xname'], 1)[0]
    if xname.isalpha() and xname != xname.upper():
        working['xname'] = lcfirst(working['xname'])
    # Select a mask and format the data for it
    if (working['cut']
            and working['cut'] not in ('carved', 'intarsia')
            and 'beads' not in working['cut']):
        working['cut'] = format_modifier(working['cut']) + '-cut'
    if working['setting'].lower() in OBJECTS:
        working['colors'] = format_modifier(oxford_comma(working['colors']))
        mask = u'{cut}, {colors} {xname} {setting}'
    elif working['setting'] and 'beads' in working['cut']:
        working['setting'] = add_article(working['setting'])
        working['colors'] = format_modifier(oxford_comma(working['colors']))
        mask = u'{setting} featuring {colors} {xname} {cut}'
    elif working['setting']:
        working['setting'] = add_article(working['setting'])
        working['colors'] = format_modifier(oxford_comma(working['colors']))
        mask = u'{setting} featuring {cut}, {colors} {xname}'
    elif working['cut'] and len(working['colors']) <= 2:
        working['colors'] = format_modifier(oxford_comma(working['colors']))
        mask = u'{cut}, {colors} {xname} from {locality}'
    elif working['cut']:
        working['colors'] = oxford_comma(working['colors'])
        mask = u'{cut} {xname} colored {colors} from {locality}'
    elif working['name'] and not working['locality']:
        mask = u''
    else:
        mask = u'{xname} from {locality}'
    # Add common elements
    prefix = u'{name}.'
    suffix = u'weighing {weight}. Described as "{description}."'
    mask = u' '.join([s for s in [prefix, mask, suffix] if s])
    caption = clean_caption(mask.format(**working))
    # Fix capitalization of second sentence when name is specified
    if working['name']:
        sentences = caption.split('. ', 1)
        # A caption that is nothing but the name has no second sentence, so
        # check the length before indexing
        if len(sentences) == 2 and not sentences[1].startswith('Described'):
            sentences[1] = ucfirst(add_article(sentences[1]))
        caption = '. '.join(sentences)
    return caption
def format_colors(rec):
    """Formats colors

    Only the first entry of MinColor_tab is used, and only when it holds no
    comma and MinCut does not describe multiple items. The entry is
    lowercased, its spaces become hyphens, and the abbreviation "med" is
    expanded to "medium".

    Returns:
        List of formatted colors, or an empty list when no single color can
        be derived or the color is "various".
    """
    colors = rec('MinColor_tab')
    if colors and ',' not in colors[0] and not is_multiple(rec('MinCut')):
        primary = colors[0].lower().replace(' ', '-')
        # The pattern must be a raw string: in a normal string literal \b is
        # a backspace character, not a word boundary, and "med" never matches
        return [re.sub(r'\bmed\b', 'medium', s.strip('- '), flags=re.I)
                for s in primary.split(',') if s != 'various']
    return []
def format_locality(country, state, county):
    """Joins locality info into a comma-delimited string

    Unknown and synthetic countries yield an empty string. The country is
    left out for the United States, and empty parts are skipped.
    """
    if any(flag in country for flag in ('Unknown', 'Synthetic')):
        return ''
    parts = [county, state]
    if country != 'United States':
        parts.append(country)
    return ', '.join(filter(None, parts))
def format_gems(rec):
    """Formats setting and cut of jewellery

    Reads MinJeweleryType (setting) and MinCut (cut) from the record and
    returns them as the tuple (cut, setting) after normalization.
    """
    setting = rec('MinJeweleryType').lower()
    cut = rec('MinCut').lower()
    # A cut that lists several items is discarded
    if is_multiple(cut):
        cut = ''
    # Nothing to normalize
    if not (setting or cut):
        return cut, setting

    # Derive object type from cut if no setting is given.
    # NOTE(review): the nesting of this loop was reconstructed from a
    # flattened listing -- confirm against the canonical source.
    if not setting:
        for term in INFLECTED:
            for inflection in term:
                if inflection in cut:
                    setting = inflection
                    if setting in cut:
                        cut = u''
                    break

    # Intarsia describes the object, so it moves from cut to setting
    if cut == 'intarsia':
        setting = u'{} {}'.format(cut, setting)
        cut = u''

    # Standardize the formatting of cut
    if cut in ALWAYS_PLURAL:
        cut = plural(cut)
    if cut == 'various':
        cut = ''
    while cut.endswith((' cut', '-cut')):
        cut = cut[:-4]
    # A carving with no cut is described as carved instead
    if setting == 'carving' and not cut:
        cut = u'carved'
        setting = u''

    # Format setting
    if setting in ALWAYS_PLURAL:
        setting = plural(setting)
    # Drop a setting that the cut already mentions
    if setting in cut:
        setting = u''
    setting = setting.lower().rstrip('. ')
    return cut, setting
def format_modifier(modifier):
    """Turns a phrase into a hyphenated compound modifier

    Words are split on whitespace and hyphens and rejoined with hyphens. A
    leading adverb is followed by a space instead, so "very light blue"
    becomes "very light-blue".
    """
    tokens = re.split(r'[\s\-]+', modifier.strip())
    pieces = []
    for index, token in enumerate(tokens):
        token = token.strip('. ')
        # Only an adverb in first position is set off with a space
        joiner = ' ' if index == 0 and is_adverb(token) else '-'
        pieces.append(token + joiner)
    return ''.join(pieces).rstrip('-')
def is_adverb(word):
    """Makes a rough guess at whether a word is an adverb

    A word counts as an adverb if it is "very" or ends in "ly"; case is
    ignored.
    """
    lowered = word.lower()
    if lowered == 'very':
        return True
    return lowered.endswith('ly')
def is_multiple(phrase):
    """Makes a rough guess at whether a phrase lists more than one item

    A phrase counts as multiple if it contains a comma or the word "and"
    with a space on either side.
    """
    return any(marker in phrase for marker in (',', ' and '))