Source code for minsci.xmu.tools.biblio.doi

# -*- coding: utf-8 -*-
"""Populates ebibliography records based on BibTeX records pulled using DOI"""

import csv
import io
import logging
import os
import pprint as pp
import re
from datetime import datetime

import requests
import requests_cache
from dateparser import parse
from nameparser import HumanName

from .bibbot import BibBot
from .ris import split_records, ris2dict
from ....xmu.constants import FIELDS
from ....xmu import XMu, BiblioRecord, write


# Name of the EMu module; clone() assigns it to the module attribute of
# every record it creates
MODULE = 'ebibliography'

# Maps a BibTeX entry type to the three-letter prefix that emuize() puts in
# front of EMu field names (for example, prefix + 'Title'). Entry types
# missing from this dict make emuize() raise.
PUB_TYPES = {
    'article': 'Art',
    'book': 'Boo',
    'incollection': 'Art',
    'techreport': 'Art',
    'thes': 'The'
}

# Maps the record type of a parent publication to the BibTeX key holding the
# parent's title. emuize() uses the first three letters of the record type
# as the prefix for the parent's field names.
SOURCES = {
    'Book': 'booktitle',
    'Journal': 'journal'
}

# Literal substrings that parse_bibtex() replaces in the raw BibTeX text
# before handing it to the parser. Keys are raw strings, so every backslash
# (including the \u sequences in the Stoffler entry) is literal text rather
# than an escape.
ENTITIES = {
    r'$\mathsemicolon$': u';',
    r'{\{AE}}': u'Æ',
    r'({IUCr})': u'(IUCr)',
    r'{\textdegree}': u'°' ,
    r'{\textquotesingle}': u"'",
    r'\textemdash': u'—',
    r'\textendash': u'–',
    r'St\u0e23\u0e16ffler': u'Stoffler',
    r'{\'{a}}': 'a'
}


# Shared BibBot instance; doi2bib() sends its GET requests through it
bot = BibBot()


class FillFromDOI(XMu):
    """Fill out skeleton bibliography records that have DOIs

    Attributes:
        records (list): EMu-formatted records built by iterate()
        errors (list): exceptions caught while converting BibTeX to EMu
    """

    def __init__(self, *args, **kwargs):
        super(FillFromDOI, self).__init__(*args, **kwargs)
        self.records = []
        self.errors = []

    def iterate(self, element):
        """Pulls reference information from BibTeX based on DOI in EMu record

        The DOI is read from the GUID table or, failing that, from RIS data
        in the notes field. If the DOI resolves and the BibTeX converts
        cleanly, the formatted record is appended to self.records.

        Args:
            element: the record to process; it is passed to self.parse()

        Raises:
            ValueError: if the DOI belongs to one of the forbidden
                providers (BHL, PANGAEA, FastLink)
        """
        rec = self.parse(element)
        doi = rec.get_guid('DOI')
        # Check for DOIs in the notes field if not found in the GUID table
        note = rec('NotNotes')
        ris = None
        if not doi and 'DO' in note:
            ris = ris2dict(split_records(note)[0])
            doi = ris.get('DO')
            if doi:
                rec['AdmGUIDType_tab'] = 'DOI'
                rec['AdmGUIDValue_tab'] = clean_doi(doi)
        if not doi:
            return
        if 'bhl.title' in doi:
            raise ValueError('BHL DOIs are forbidden: {}'.format(doi))
        elif '/PANGAEA.' in doi:
            raise ValueError('PANGAEA DOIs are forbidden: {}'.format(doi))
        elif '/10.4095/' in doi:
            raise ValueError('FastLink DOIS are forbidden: {}'.format(doi))
        try:
            bibtex = doi2bib(doi)
        except ValueError:
            logging.exception('doi')
            return
        if bibtex is None:
            return
        try:
            formatted = emuize(parse_bibtex(bibtex))
        except ValueError as e:
            logging.exception('doi')
            self.errors.append(e)
            # Nothing was formatted, so there is no record to finish; carrying
            # on would fail on the unbound name and mask the real error.
            return
        formatted['irn'] = rec('irn')
        # Remove DOIs, since these already exist in the source
        # record and the existing values are already cased properly.
        # DOIs found in the notes field are excepted.
        if ris is None:
            for key in ('AdmGUIDType_tab', 'AdmGUIDValue_tab'):
                try:
                    del formatted[key]
                except KeyError:
                    # emuize() only sets these when the BibTeX has a doi
                    # field, so there may be nothing to remove
                    pass
        formatted['NotNotes'] = bibtex
        self.records.append(formatted)


def doi2emu(fp):
    """Parses BibTeX data for a DOI found in an ebibliography export

    Args:
        fp: the export to read; it is passed to FillFromDOI unchanged

    Returns:
        List of the records collected by FillFromDOI.iterate()
    """
    filler = FillFromDOI(fp, container=BiblioRecord)
    filler.fast_iter(report=10)
    return filler.records


def doi2bib(doi):
    """Returns a bibTeX string of metadata for a given DOI.

    Source: https://gist.github.com/jrsmith3/5513926

    Args:
        doi (str): a valid DOI corresponding to a publication

    Returns:
        BibTeX record as a string, with any leading whitespace removed

    Raises:
        ValueError: if the response is not a BibTeX record
    """
    url = requests.compat.urljoin('http://dx.doi.org/', doi)
    print('Checking {}...'.format(url))
    headers = {'accept': 'application/x-bibtex'}
    response = bot.get(url, headers=headers)
    # Ignore leading whitespace so that a record preceded by a blank or a
    # newline is not mistaken for a failed lookup
    bibtex = response.text.lstrip()
    if not bibtex.startswith('@'):
        raise ValueError('  ERROR: Could not resolve {}'.format(doi))
    return bibtex


def parse_bibtex(bib):
    """Parses the BibTeX returned by the DOI resolver

    Args:
        bib (str): a BibTeX record

    Returns:
        Dict containing reference data for the first entry in the record

    Raises:
        Exception: if a value still contains a brace after clean up
    """
    for entity, repl in ENTITIES.items():
        bib = bib.replace(entity, repl)
    # Parse BibTeX using the handy dandy bibtexparser module. The import is
    # local, so the module can be loaded without bibtexparser installed.
    import bibtexparser
    from bibtexparser.bparser import BibTexParser
    from bibtexparser.customization import convert_to_unicode
    parser = BibTexParser()
    parser.customization = convert_to_unicode
    parsed = bibtexparser.loads(bib, parser=parser).entries[0]
    # Miscellaneous clean up. The pattern unwraps braces around either a run
    # of characters from the first class or any single character.
    # NOTE(review): `A-z` also spans the punctuation between `Z` and `a`
    # ([ \ ] ^ _ `); left as written, confirm whether `A-Za-z` was meant.
    braces = re.compile(u'\\{([A-z_ \\-]+|[\u0020-\uD7FF])\\}', re.U)
    for key, val in list(parsed.items()):
        val = braces.sub(r'\1', val)
        if '{' in val:
            # 'replace' keeps an unencodable character from raising a
            # UnicodeEncodeError that would hide this message
            raise Exception('Unhandled LaTeX: {}'.format(
                val.encode('cp1252', 'replace')))
        parsed[key] = val
    # The pages key is always present after this line (empty if the record
    # had none)
    parsed['pages'] = parsed.get('pages', '').replace('--', '-')
    # Drop a trailing parenthetical from the publisher name
    if parsed.get('publisher', '').endswith(')'):
        parsed['publisher'] = parsed['publisher'].rsplit('(', 1)[0].rstrip()
    return parsed


def parse_authors(author_string, parse_names=True):
    """Parse a list of authors into components used by EMu

    Args:
        author_string (str): a string with one or more authors separated
            by commas, " & ", or " and "
        parse_names (bool): if True, parse names into components and
            return them as records; if False, return the names as strings

    Returns:
        A list of the parsed authors
    """
    # NOTE(review): splitting on commas turns "Last, First" into two
    # authors; confirm that names always arrive as "First Last".
    parsed = []
    for author in re.split(r',| & | and ', author_string):
        # Put a single space after each period (J.Smith => J. Smith)
        author = author.replace('.', '. ').replace('  ', ' ')
        if not parse_names:
            parsed.append(author)
            continue
        name = HumanName(author)
        parsed.append(clone({
            'NamTitle': name.title,
            'NamFirst': name.first,
            'NamMiddle': name.middle,
            'NamLast': name.last,
            'NamSuffix': name.suffix,
            'SecRecordStatus': 'Unlisted'
        }))
    return parsed


def emuize(data):
    """Convert a BibTex record into an EMu record

    Keys are popped from data as they are cross-walked, so the dict passed
    in is consumed. Pass a copy if the original is needed afterwards.

    Args:
        data (dict): a parsed BibTeX record

    Returns:
        A DeepDict object formatted for EMu

    Raises:
        Exception: if the publication type is not in PUB_TYPES, if the
            record has neither a booktitle nor a journal, or if any keys
            are left over after the cross-walk
    """
    rec = clone()
    kind = data.pop('ENTRYTYPE')
    try:
        prefix = PUB_TYPES[kind]
    except KeyError:
        pp.pprint(data)
        raise Exception('Unrecognized publication type: {}'.format(kind))
    # Authors and editors share one pair of table fields, so collect both
    # before assigning. Assigning them one after the other would replace
    # the authors with the editors whenever a record has both.
    contributors = []
    roles = []
    if 'author' in data:
        authors = parse_authors(data.pop('author'))
        # Special handling for authors of theses/dissertations
        if prefix == 'The':
            rec[prefix + 'AuthorsRef'] = authors[0]
        else:
            contributors.extend(authors)
            roles.extend(['Author'] * len(authors))
    if 'editor' in data:
        editors = parse_authors(data.pop('editor'))
        contributors.extend(editors)
        roles.extend(['Editor'] * len(editors))
    if contributors:
        rec[prefix + 'AuthorsRef_tab'] = contributors
        rec[prefix + 'Role_tab'] = roles
    # Article title
    rec[prefix + 'Title'] = data.pop('title', '[MISSING TITLE]')
    # Periodical information
    for bibtex_key, suffix in (('volume', 'Volume'), ('number', 'Issue')):
        if bibtex_key in data:
            rec[prefix + suffix] = data.pop(bibtex_key)
    if 'pages' in data:
        # Collapse repeated hyphens and drop leading/trailing ones
        pages = '-'.join([s for s in data.pop('pages').split('-') if s])
        rec[prefix + 'Pages'] = pages
        rec[prefix + 'IssuePages'] = pages
    # Publication date
    century = None
    year = data.pop('year', None)
    if year is None:
        year = u''
    elif int(year[:2]) < 19:
        # HACK: Part 1 of fix for dates before 1900. The year is moved
        # into the 1900s here and the real century is put back below,
        # presumably because strftime rejects earlier years -- confirm.
        century = year[:2]
        year = '19' + year[2:]
    month = data.pop('month', u'')
    date = parse(' '.join([month, year]))
    if date is not None:
        if month:
            nominal_date = actual_date = date.strftime('%b %Y')
        else:
            nominal_date = actual_date = year
        # HACK: Part 2 of fix for dates before 1900
        if century is not None:
            nominal_date = nominal_date.replace('19', century, 1)
            actual_date = actual_date.replace('19', century, 1)
        rec[prefix + 'PublicationDates'] = nominal_date
        rec[prefix + 'PublicationDate'] = actual_date
    # DOI
    if 'doi' in data:
        rec['AdmGUIDValue_tab'] = [clean_doi(data.pop('doi'))]
        rec['AdmGUIDType_tab'] = ['DOI']
    # Source. The first of booktitle/journal found becomes the parent.
    parent = clone()
    # Bound up front so the multimedia section can test it even when the
    # record has no publisher
    publisher = None
    for source in ('Book', 'Journal'):
        bibtex_key = SOURCES[source]
        if bibtex_key not in data:
            continue
        source_kind = source[:3]
        parent['BibRecordType'] = source
        parent[source_kind + 'Title'] = data.pop(bibtex_key)
        publisher = data.pop('publisher', None)
        if publisher is not None:
            parent[source_kind + 'PublishedByRef'] = clone({
                'NamPartyType' : 'Organization',
                'NamInstitution': '',
                'NamOrganisation' : publisher
            })
        rec[prefix + 'ParentRef'] = parent.expand()
        break
    else:
        pp.pprint(data)
        raise Exception('Unrecognized parent publication')
    # Notes
    if 'url' in data:
        url = data.pop('url')
        today = datetime.now().strftime('%Y-%m-%d')
        rec['NotNotes'] = u'Data retrieved from {} on {}'.format(url, today)
    # Fields we're not interested in at present
    data.pop('link', None)
    # Multimedia. A PDF named after the BibTeX id is attached if it can be
    # opened from the files directory.
    fp = os.path.abspath(os.path.join('files', data.pop('ID') + '.pdf'))
    try:
        # Opened only to test readability, so close the handle right away
        open(fp, 'rb').close()
    except IOError:
        pass
    else:
        media = {
            'Multimedia': fp,
            'MulTitle': rec[prefix + 'Title'],
            'MulCreator_tab': ['Adam Mansur'],
            'DetResourceType': u'Publication/Manuscript',
            'DetCollectionName_tab': ['Documents and data (Mineral Sciences)'],
            'AdmPublishWebNoPassword': 'No',
            'AdmPublishWebPassword': 'No',
            'AdmGUIDType_tab': rec['AdmGUIDType_tab'],
            'AdmGUIDValue_tab': rec['AdmGUIDValue_tab']
        }
        if publisher is not None:
            media['DetPublisher'] = publisher
        rec['MulMultiMediaRef_tab'] = [clone(media).expand()]
    rec.expand()
    # Look for keys that haven't been cross-walked to EMu schema
    if data:
        pp.pprint(data)
        raise Exception('Unhandled keys: {}'.format(sorted(data.keys())))
    return rec


def clone(*args):
    """Creates new record with key attributes copied from global scope

    Args:
        *args: positional arguments passed straight to BiblioRecord

    Returns:
        BiblioRecord with fields set to FIELDS and module set to MODULE
    """
    record = BiblioRecord(*args)
    record.fields = FIELDS
    record.module = MODULE
    return record


def clean_doi(doi):
    """Strips anything that precedes the "10." prefix of a DOI

    A warning is printed whenever the value does not already start with
    the prefix.

    Args:
        doi (str): a DOI, possibly preceded by a label or resolver URL

    Returns:
        The DOI starting from the first "10.", or the value unchanged if
        it does not contain "10." at all
    """
    prefix = '10.'
    if doi.startswith(prefix):
        return doi
    print('WARNING: DOI looks funny: {}'.format(doi))
    start = doi.find(prefix)
    if start == -1:
        # No recognizable DOI to recover, so leave the value alone
        return doi
    # Keep the prefix and what follows it (not what precedes it)
    return doi[start:]


def process_file(fp):
    """Create an EMu import file from a list of DOIs

    Reads a UTF-16 CSV with the columns DOI, Filename, and IRN, then looks
    up each DOI and writes the resulting records to import.xml. Files
    listed in the Filename column are renamed in the files directory to
    match the id of the BibTeX record. The updated list is written to
    doi.txt in the working directory.

    Args:
        fp (str): the path to the list of DOIs
    """
    records = []
    updated = []
    rename = []
    with io.open(fp, 'r', encoding='utf16') as f:
        rows = csv.DictReader(f, delimiter=',', quotechar='"')
        for ref in rows:
            bib = parse_bibtex(doi2bib(ref['DOI']))
            # emuize() pops keys from its argument and the id is needed
            # below, so hand it a copy
            rec = emuize(bib.copy())
            if rec is not None:
                records.append(rec)
                # Update filename to match the id from the BibTeX record
                fn = ref['Filename']
                if fn:
                    ext = os.path.splitext(fn)[1]
                    ref['Filename'] = bib['ID'] + ext
                    src = os.path.join('files', fn)
                    dst = os.path.join('files', ref['Filename'])
                    if src != dst:
                        rename.append((src, dst))
            updated.append(ref)
    # Renames are deferred until every DOI has been processed
    for src, dst in rename:
        os.rename(src, dst)
    write('import.xml', records, 'ebibliography')
    # Update the DOI file. The rows are written as UTF-8 bytes first and
    # converted in a second pass.
    keys = ['DOI', 'Filename', 'IRN']
    with open('doi.txt', 'wb') as f:
        writer = csv.writer(f, delimiter=',', quotechar='"')
        writer.writerow([s.encode('utf-8') for s in keys])
        for ref in updated:
            writer.writerow([ref[key].encode('utf-8') for key in keys])
    # Re-encode the DOI file using the utf-16 codec
    with open('doi.txt', 'rb') as f:
        data = f.read().decode('utf-8')
    with io.open('doi.txt', 'w', encoding='utf-16', newline='\n') as f:
        f.write(data)