# Source code for minsci.xmu.tools.describer

"""Tools to describe and link multimedia using data from ecatalogue"""

import re
from collections import namedtuple
from copy import deepcopy

from ...helpers import (add_article, lcfirst, oxford_comma,
                        plural, singular, ucfirst)


# Terms that sometimes turn up in the cut field but name a whole object
# rather than a setting. Entries are processed in the order listed here.
OBJECTS = [
    'box', 'bead', 'bowl', 'bottle', 'cup',
    'pendant', 'sphere', 'urn', 'vase', 'carving',
]
# One (singular, plural) pair per entry in OBJECTS, in the same order
INFLECTED = [(singular(_term), plural(_term)) for _term in OBJECTS]

# Ordered [search, replacement] pairs. The first entries tidy spacing and
# expand "med"; the rest add the hyphens that terms need to be properly
# formatted as adjectives. Every ordered combination of light/medium/dark is
# covered exactly once.
# NOTE(review): no use of PAIRS is visible here -- presumably callers apply
# the pairs in order, so confirm before reordering entries.
PAIRS = [
    ['  ', ' '],
    [',-', ', '],
    [' med ', ' medium '],
    ['-med-', '-medium-'],
    [' shaped', '-shaped'],
    ['off white', 'off-white'],
    ['play of color', 'play-of-color'],
    ['light medium', 'light-to-medium'],
    ['light to medium', 'light-to-medium'],
    ['light dark', 'light-to-dark'],
    ['light to dark', 'light-to-dark'],
    ['medium light', 'medium-to-light'],
    ['medium to light', 'medium-to-light'],
    ['medium dark', 'medium-to-dark'],
    ['medium to dark', 'medium-to-dark'],
    ['dark light', 'dark-to-light'],
    ['dark to light', 'dark-to-light'],
    ['dark medium', 'dark-to-medium'],
    ['dark to medium', 'dark-to-medium'],
]

# Terms that are pluralized whenever they are used as a cut or a setting
ALWAYS_PLURAL = ['bead']

# Result container returned by summarize()
Description = namedtuple('Description', 'object caption keywords summary')


def summarize(rec):
    """Summarizes basic information about an object

    Args:
        rec: catalog record. Its module attribute is overwritten with
            'ecatalogue' before any data is read.

    Returns:
        Description namedtuple holding the culled descriptor dict (object),
        the caption, the keywords, and a one-line summary
    """
    rec.module = 'ecatalogue'  # force module to ecatalogue
    descriptors = get_descriptors(rec)
    caption = get_caption(descriptors=descriptors)
    keywords = get_keywords(descriptors=descriptors)
    tags = get_tags(descriptors=descriptors)
    # Write summary line used to make a quick id of sample (for example, when
    # matching media to samples). Tags are appended in brackets only when
    # there are any; joining them avoids leaking the list repr into the text.
    summary = u'{}: {}'.format(descriptors['catnum'], caption)
    if tags:
        summary += u' [{}]'.format(u', '.join(tags))
    # Cull unneeded keys from descriptors
    keep = ['irn', 'catnum', 'status', 'xname', 'url']
    # items() exists on both Python 2 and 3; iteritems() is Python 2 only
    obj = {key: val for key, val in descriptors.items() if key in keep}
    obj['xname'] = ucfirst(obj['xname'])
    return Description(object=obj, caption=caption,
                       keywords=keywords, summary=summary)


def get_descriptors(rec):
    """Parses basic descriptive information about a record into a dict

    Args:
        rec: catalog record, read both by calling it with a field name and
            through its get_* methods

    Returns:
        dict with the keys irn, catnum, name, xname, taxa, kind, cut,
        setting, colors, locality, country, state, weight, description,
        status, and url
    """
    # Prefer the mineral name and fall back to the meteorite name
    name = rec('MinName') or rec('MetMeteoriteName')
    catnum = rec.get_identifier(include_div=True, force_catnum=True)
    # When nothing but "USNM" precedes the first parenthesis, identify the
    # object by its name instead
    if catnum.split('(')[0].strip() == 'USNM':
        catnum = name + ' (MET)'
    taxa = rec.get_classification()
    try:
        xname = rec.get_name(taxa=taxa, force_derived=True)
    except KeyError:
        xname = name
    # First word of the catalog name with any trailing "s" removed
    kind = rec('CatCatalog').split(' ')[0].rstrip('s')
    cut, setting = format_gems(rec)
    country, state, county = rec.get_political_geography()
    description = rec('BioLiveSpecimen').lower().rstrip('.').replace('"', "'")
    # A description that only repeats the name adds nothing to the caption
    if description == name.lower():
        description = ''
    weight = rec.get_current_weight() if kind == 'Meteorite' else ''
    descriptors = {
        'irn': rec('irn'),
        'catnum': catnum,
        'name': name,
        'xname': xname,
        'taxa': taxa,
        'kind': kind,
        'cut': cut,
        'setting': setting,
        'colors': format_colors(rec),
        'locality': format_locality(country, state, county),
        'country': country,
        'state': state,
        'weight': weight,
        'description': description,
        'status': rec('SecRecordStatus').lower(),
        'url': rec.get_url()
    }
    # Only meteorite names keep their leading capital
    if kind != 'Meteorite':
        descriptors['xname'] = lcfirst(descriptors['xname'])
    return descriptors


def get_caption(rec=None, descriptors=None):
    """Derives a simple description of an object

    Args:
        rec: catalog record, used only when descriptors is not given
        descriptors: dict of the kind returned by get_descriptors

    Returns:
        Caption string. Each sentence ends with a period unless it already
        ends with a closing double quote.
    """
    if descriptors is None:
        descriptors = get_descriptors(rec)
    lines = [format_caption(descriptors)]
    # Mark inactive records
    status = descriptors['status']
    if status and status != 'active':
        if status == 'inactive':
            status = 'made inactive'
        lines.append('The catalog record associated with this'
                     ' specimen has been {}.'.format(status))
    sentences = [s.rstrip('. ') for s in lines]
    # Terminate each sentence individually so that a quoted description
    # ('Described as "x."') is not followed by a second period when the
    # status note comes after it
    return ' '.join([s if s.endswith('"') else s + '.' for s in sentences])


def get_keywords(rec=None, descriptors=None):
    """Sets multimedia keywords for the given object

    Args:
        rec: catalog record, used only when descriptors is not given
        descriptors: dict of the kind returned by get_descriptors

    Returns:
        List of capitalized keywords: kind, setting, taxa, country, and
        (for the United States only) state. Empty values and values
        containing "unknow" are left out.
    """
    if descriptors is None:
        descriptors = get_descriptors(rec)
    keywords = [descriptors['kind'], descriptors['setting']]
    keywords.extend(descriptors['taxa'])
    country = descriptors['country']
    keywords.append(country)
    # Tolerate a missing country instead of failing on .lower()
    if (country or '').lower() == 'united states':
        keywords.append(descriptors['state'])
    return [ucfirst(s) for s in keywords if s and 'unknow' not in s.lower()]


def get_tags(rec=None, descriptors=None):
    """Sets tags with special information useful in identifying objects

    Args:
        rec: catalog record, used only when descriptors is not given
        descriptors: dict of the kind returned by get_descriptors

    Returns:
        List of tags. Always empty for now because every tagging rule
        below is disabled.
    """
    if descriptors is None:
        descriptors = get_descriptors(rec)
    tags = []
    # Disabled tagging rules, kept for reference. They refer to an `obj`
    # that is not available in this function.
    #if obj.collections and 'polished thin' in obj.collections[0].lower():
    #    tags.append('PTS')
    #if 'GGM' in obj.location.upper():
    #    tags.append('GGM')
    #elif 'POD 4' in obj.location.upper():
    #    tags.append('POD 4')
    return tags


def clean_caption(caption):
    """Cleans vestigial phrases from caption

    Runs of spaces are collapsed and leading/trailing periods, commas, and
    spaces are stripped before the phrase replacements are applied, so the
    result can still end with whitespace left behind by a replacement.

    Args:
        caption: caption text produced by filling in a caption mask

    Returns:
        Cleaned caption passed through ucfirst
    """
    while '  ' in caption:
        caption = caption.replace('  ', ' ')
    caption = caption.strip('., ')
    # Applied in order: later entries tidy up what earlier ones leave behind
    replacements = (
        ('from weighing', 'weighing'),
        ('colored from', 'from'),
        ('weighing .', '.'),
        ('Described as "."', ''),
        ('carved,', 'carved'),
        (' , ', ' '),
        (' . ', '. '),
        ('..', '.'),
        (',-', ', '),
    )
    for old, new in replacements:
        caption = caption.replace(old, new)
    return ucfirst(caption)


def format_caption(descriptors):
    """Formats caption based on the information in descriptors

    Args:
        descriptors: dict of the kind returned by get_descriptors. It is
            deep-copied, so the caller's dict is not modified.

    Returns:
        Caption string as cleaned up by clean_caption
    """
    working = deepcopy(descriptors)
    # Make global changes to descriptors
    if working['catnum'].endswith('(MET)'):
        working['description'] = ''
        # Lowercase the name only if its first word is alphabetic and not
        # written in all caps
        xname = re.split(r'[ -]', working['xname'], maxsplit=1)[0]
        if xname.isalpha() and xname != xname.upper():
            working['xname'] = lcfirst(working['xname'])
    # Select a mask and format the data for it
    cut = working['cut']
    if cut and cut not in ('carved', 'intarsia') and 'beads' not in cut:
        working['cut'] = format_modifier(cut) + '-cut'
    if working['setting'].lower() in OBJECTS:
        working['colors'] = format_modifier(oxford_comma(working['colors']))
        mask = u'{cut}, {colors} {xname} {setting}'
    elif working['setting'] and 'beads' in working['cut']:
        working['setting'] = add_article(working['setting'])
        working['colors'] = format_modifier(oxford_comma(working['colors']))
        mask = u'{setting} featuring {colors} {xname} {cut}'
    elif working['setting']:
        working['setting'] = add_article(working['setting'])
        working['colors'] = format_modifier(oxford_comma(working['colors']))
        mask = u'{setting} featuring {cut}, {colors} {xname}'
    elif working['cut'] and len(working['colors']) <= 2:
        working['colors'] = format_modifier(oxford_comma(working['colors']))
        mask = u'{cut}, {colors} {xname} from {locality}'
    elif working['cut']:
        working['colors'] = oxford_comma(working['colors'])
        mask = u'{cut} {xname} colored {colors} from {locality}'
    elif working['name'] and not working['locality']:
        mask = u''
    else:
        mask = u'{xname} from {locality}'
    # Add common elements. Empty pieces leave vestigial phrases behind that
    # clean_caption removes.
    prefix = u'{name}.'
    suffix = u'weighing {weight}. Described as "{description}."'
    mask = u' '.join([s for s in [prefix, mask, suffix] if s])
    caption = clean_caption(mask.format(**working))
    # Fix capitalization of second sentence when name is specified
    if working['name']:
        sentences = caption.split('. ', 1)
        # Cleaning can leave the name with no second sentence (or only
        # whitespace) after it; there is nothing to capitalize in that case
        if (len(sentences) == 2 and sentences[1].strip()
                and not sentences[1].startswith('Described')):
            sentences[1] = ucfirst(add_article(sentences[1]))
            caption = '. '.join(sentences)
    return caption


def format_colors(rec):
    """Formats colors

    Only the first entry of MinColor_tab is used, and only when it holds a
    single color (no comma) and MinCut does not describe multiple items.

    Args:
        rec: catalog record, read by calling it with a field name

    Returns:
        List with one lowercase, hyphenated color, or an empty list
    """
    colors = rec('MinColor_tab')
    if colors and ',' not in colors[0] and not is_multiple(rec('MinCut')):
        color = colors[0].lower().replace(' ', '-')
        if color == 'various':
            return []
        # Raw string is required: in a plain string '\b' is a backspace
        # character, not a word boundary, and "med" would never match
        return [re.sub(r'\bmed\b', 'medium', color.strip('- '), flags=re.I)]
    return []


def format_locality(country, state, county):
    """Formats locality info as a comma-delimited string

    Args:
        country: country name; None is treated as empty
        state: state or province name
        county: county or district name

    Returns:
        "county, state, country" with empty parts left out. The country is
        omitted for the United States, and the result is empty when the
        country contains "Unknown" or "Synthetic".
    """
    country = country or ''
    if 'Unknown' in country or 'Synthetic' in country:
        return ''
    parts = [county, state]
    if country != 'United States':
        parts.append(country)
    return ', '.join([s for s in parts if s])



def format_gems(rec):
    """Formats setting and cut of jewelry

    Args:
        rec: catalog record, read by calling it with a field name

    Returns:
        Tuple of (cut, setting), both lowercase and possibly empty
    """
    setting = rec('MinJeweleryType').lower()
    cut = rec('MinCut').lower()
    if is_multiple(cut):
        cut = ''
    if not (setting or cut):
        return cut, setting
    # Derive object type from cut if no setting is given. The first
    # inflection found in the cut becomes the setting and consumes the cut.
    # NOTE(review): this is a substring test, so short terms like "urn"
    # also match inside longer words -- confirm that is acceptable.
    if not setting:
        for inflection in (i for term in INFLECTED for i in term):
            if inflection in cut:
                setting = inflection
                cut = u''
                break
    if cut == 'intarsia':
        setting = u'{} {}'.format(cut, setting)
        cut = u''
    # Standardize the formatting of cut
    if cut in ALWAYS_PLURAL:
        cut = plural(cut)
    if cut == 'various':
        cut = ''
    while cut[-4:] in (' cut', '-cut'):
        cut = cut[:-4]
    if setting == 'carving' and not cut:
        cut = u'carved'
        setting = u''
    # Format setting
    if setting in ALWAYS_PLURAL:
        setting = plural(setting)
    # Drop a setting that the cut already mentions
    if setting in cut:
        setting = u''
    setting = setting.lower().rstrip('. ')
    return cut, setting


def format_modifier(modifier):
    """Formats a string as a compound modifier

    Words are split on whitespace and hyphens and rejoined with hyphens. A
    leading adverb is followed by a space instead ("very light-blue").

    Args:
        modifier: phrase to format

    Returns:
        Hyphenated phrase without leading or trailing separators; empty if
        the phrase contains no words
    """
    parts = re.split(r'[\s\-]+', modifier.strip())
    # Drop words that are empty once periods are stripped so they cannot
    # produce doubled hyphens
    words = [w for w in (s.strip('. ') for s in parts) if w]
    if not words:
        return ''
    # Only a leading adverb that actually modifies something gets a space;
    # a lone word is returned without a trailing separator
    if len(words) > 1 and is_adverb(words[0]):
        return words[0] + ' ' + '-'.join(words[1:])
    return '-'.join(words)


def is_adverb(word):
    """Simplistically checks if a word is an adverb

    Args:
        word: single word to test; case is ignored

    Returns:
        True if the word is "very" or ends with "ly"
    """
    lowered = word.lower()
    if lowered == 'very':
        return True
    return lowered.endswith('ly')


def is_multiple(phrase):
    """Simplistically checks if a phrase contains multiple items

    Args:
        phrase: text to test

    Returns:
        True if the phrase contains a comma or the word "and" set off by
        spaces
    """
    return any(marker in phrase for marker in (',', ' and '))