# -*- coding: utf-8 -*-
"""Populates ebibliography records based on BibTeX records pulled using DOI"""
import csv
import io
import logging
import os
import pprint as pp
import re
from datetime import datetime
import requests
import requests_cache
from dateparser import parse
from nameparser import HumanName
from .bibbot import BibBot
from .ris import split_records, ris2dict
from ....xmu.constants import FIELDS
from ....xmu import XMu, BiblioRecord, write
# Module name assigned to every record created by clone() and passed to
# write() when the import file is created
MODULE = 'ebibliography'
# Maps BibTeX entry types to the prefix emuize() puts in front of EMu field
# names (for example, 'Art' + 'Title'). emuize() raises an exception for
# entry types that are not listed here.
PUB_TYPES = {
    'article': 'Art',
    'book': 'Boo',
    'incollection': 'Art',
    'techreport': 'Art',
    'thes': 'The'
}
# Maps the record type of a parent publication to the BibTeX key that
# emuize() reads the parent title from
SOURCES = {
    'Book': 'booktitle',
    'Journal': 'journal'
}
# Literal substrings that parse_bibtex() replaces in the raw BibTeX before
# handing it to the parser. Keys are the text as received; values are the
# replacements.
ENTITIES = {
    r'$\mathsemicolon$': u';',
    r'{\{AE}}': u'Æ',
    r'({IUCr})': u'(IUCr)',
    r'{\textdegree}': u'°' ,
    r'{\textquotesingle}': u"'",
    r'\textemdash': u'—',
    r'\textendash': u'–',
    # NOTE(review): presumably a mis-encoded author name seen in resolver
    # output -- confirm against the source data
    r'St\u0e23\u0e16ffler': u'Stoffler',
    r'{\'{a}}': 'a'
}
# Shared client used by doi2bib() to request BibTeX from the DOI resolver
bot = BibBot()
class FillFromDOI(XMu):
    """Fills out skeleton bibliography records that have DOIs

    Attributes:
        records (list): EMu-formatted records built by iterate()
        errors (list): ValueErrors raised while converting BibTeX to EMu
    """

    def __init__(self, *args, **kwargs):
        super(FillFromDOI, self).__init__(*args, **kwargs)
        self.records = []
        self.errors = []

    def iterate(self, element):
        """Pulls reference information from BibTeX based on DOI in EMu record

        The DOI is read from the GUID table or, failing that, from the RIS
        data in the notes field. Records without a DOI are skipped. A
        record whose DOI cannot be resolved or whose BibTeX cannot be
        converted is logged and skipped.

        Args:
            element: the record element handed to self.parse()

        Raises:
            ValueError: if the DOI belongs to one of the forbidden sources
        """
        rec = self.parse(element)
        doi = rec.get_guid('DOI')
        # Check for DOIs in the notes field if not found in the GUID table
        note = rec('NotNotes')
        ris = None
        if not doi and 'DO' in note:
            ris = ris2dict(split_records(note)[0])
            doi = ris.get('DO')
            if doi:
                rec['AdmGUIDType_tab'] = 'DOI'
                rec['AdmGUIDValue_tab'] = clean_doi(doi)
        if not doi:
            return
        if 'bhl.title' in doi:
            raise ValueError('BHL DOIs are forbidden: {}'.format(doi))
        elif '/PANGAEA.' in doi:
            raise ValueError('PANGAEA DOIs are forbidden: {}'.format(doi))
        elif '/10.4095/' in doi:
            raise ValueError('FastLink DOIS are forbidden: {}'.format(doi))
        try:
            bibtex = doi2bib(doi)
        except ValueError:
            logging.exception('doi')
            return
        try:
            formatted = emuize(parse_bibtex(bibtex))
        except ValueError as e:
            logging.exception('doi')
            self.errors.append(e)
            # Nothing was built for this record. Falling through would
            # reference a name that was never assigned.
            return
        formatted['irn'] = rec('irn')
        # Remove DOIs, since these already exist in the source
        # record and the existing values are already cased properly.
        # DOIs found in the notes field are excepted.
        if ris is None:
            for key in ('AdmGUIDType_tab', 'AdmGUIDValue_tab'):
                # emuize() only sets these keys when the BibTeX included
                # a doi field, so they may be absent
                try:
                    del formatted[key]
                except KeyError:
                    pass
        formatted['NotNotes'] = bibtex
        self.records.append(formatted)
def doi2emu(fp):
    """Builds EMu records from the DOIs found in an ebibliography export

    Args:
        fp: passed through to FillFromDOI (presumably the path to the
            export file -- confirm against XMu)

    Returns:
        The list of records collected by FillFromDOI
    """
    filler = FillFromDOI(fp, container=BiblioRecord)
    filler.fast_iter(report=10)
    return filler.records
def doi2bib(doi):
    """Returns a BibTeX string of metadata for a given DOI.

    Source: https://gist.github.com/jrsmith3/5513926

    Args:
        doi (str): a valid DOI corresponding to a publication

    Returns:
        BibTeX record as a string, with any leading whitespace removed

    Raises:
        ValueError: if the response does not look like a BibTeX record
    """
    url = requests.compat.urljoin('http://dx.doi.org/', doi)
    # Parenthesized single argument prints the same under the print
    # statement and the print function
    print('Checking {}...'.format(url))
    headers = {'accept': 'application/x-bibtex'}
    response = bot.get(url, headers=headers)
    # Whitespace before the entry is not significant in BibTeX, so ignore
    # it when deciding whether the response is a record
    bibtex = response.text.lstrip()
    if not bibtex.startswith('@'):
        raise ValueError(' ERROR: Could not resolve {}'.format(doi))
    return bibtex
def parse_bibtex(bib):
    """Parses the BibTeX returned by the DOI resolver

    Args:
        bib (str): a BibTeX record

    Returns:
        Dict containing reference data. A 'pages' key is always present
        (empty if the record had none).

    Raises:
        ValueError: if no entry could be parsed from the string
        Exception: if a value still contains a brace after clean up
    """
    # Imported locally, so bibtexparser is only loaded when this function
    # is called
    import bibtexparser
    from bibtexparser.bparser import BibTexParser
    from bibtexparser.customization import convert_to_unicode

    # Swap out the sequences listed in ENTITIES before parsing
    for entity, repl in ENTITIES.items():
        bib = bib.replace(entity, repl)
    # Parse BibTeX using the handy dandy bibtexparser module
    parser = BibTexParser()
    parser.customization = convert_to_unicode
    entries = bibtexparser.loads(bib, parser=parser).entries
    if not entries:
        # ValueError is the exception type FillFromDOI.iterate() handles
        raise ValueError('No BibTeX entry found')
    parsed = entries[0]
    # Miscellaneous clean up. The pattern unwraps braces around either a
    # run of characters from the first class or any single character.
    # NOTE(review): the range A-z also covers [ \ ] ^ _ and `, so braces
    # around a bare LaTeX command are unwrapped as well. Left unchanged
    # because tightening it would make such records raise below.
    braces = re.compile(u'\\{([A-z_ \\-]+|[\u0020-\uD7FF])\\}', re.U)
    for key, val in list(parsed.items()):
        val = braces.sub(r'\1', val)
        if '{' in val:
            # 'replace' keeps characters outside cp1252 from raising a
            # UnicodeEncodeError that would hide this message
            raise Exception('Unhandled LaTeX: {}'.format(
                val.encode('cp1252', 'replace')))
        parsed[key] = val
    parsed['pages'] = parsed.get('pages', '').replace('--', '-')
    # Drop a trailing parenthetical from the publisher name
    if parsed.get('publisher', '').endswith(')'):
        parsed['publisher'] = parsed['publisher'].rsplit('(', 1)[0].rstrip()
    #pp.pprint(parsed)
    return parsed
def parse_authors(author_string, parse_names=True):
    """Parse a list of authors into components used by EMu

    Names are split on commas, " & ", and " and ". Empty fragments (for
    example, from ", and ") are skipped.

    Args:
        author_string (str): a string with one or more authors
        parse_names (bool): if True, parse names into components

    Returns:
        A list of the parsed authors: records created by clone() if
        parse_names is True, otherwise whitespace-normalized strings
    """
    parsed = []
    for author in re.split(r',| & | and ', author_string):
        # Put a space after each period so initials are separated, then
        # collapse the runs of whitespace that this (and the split) leave
        author = ' '.join(author.replace('.', '. ').split())
        if not author:
            continue
        if parse_names:
            fn = HumanName(author)
            parsed.append(clone({
                'NamTitle': fn.title,
                'NamFirst': fn.first,
                'NamMiddle': fn.middle,
                'NamLast': fn.last,
                'NamSuffix': fn.suffix,
                'SecRecordStatus': 'Unlisted'
            }))
        else:
            parsed.append(author)
    return parsed
def emuize(data):
    """Convert a BibTeX record into an EMu record

    Args:
        data (dict): a parsed BibTeX record. Keys are removed from this
            dict as they are mapped, so pass a copy if the original is
            still needed.

    Returns:
        The record created by clone(), populated with EMu fields

    Raises:
        Exception: if the entry type is not in PUB_TYPES, if the record
            has neither a book title nor a journal, or if keys are left
            over once the mapping is finished
    """
    rec = clone()
    kind = data.pop('ENTRYTYPE')
    try:
        prefix = PUB_TYPES[kind]
    except KeyError:
        pp.pprint(data)
        raise Exception('Unrecognized publication type: {}'.format(kind))
    # Authors and editors. Both go into the same pair of tables, so they
    # are collected first and written once; assigning the tables twice
    # would discard the authors whenever editors are also present.
    people = []
    roles = []
    if 'author' in data:
        authors = parse_authors(data.pop('author'))
        if prefix != 'The':
            people.extend(authors)
            roles.extend(['Author'] * len(authors))
        elif authors:
            # Special handling for authors of theses/dissertations
            rec[prefix + 'AuthorsRef'] = authors[0]
    if 'editor' in data:
        editors = parse_authors(data.pop('editor'))
        people.extend(editors)
        roles.extend(['Editor'] * len(editors))
    if people:
        rec[prefix + 'AuthorsRef_tab'] = people
        rec[prefix + 'Role_tab'] = roles
    # Article title
    rec[prefix + 'Title'] = data.pop('title', '[MISSING TITLE]')
    # Periodical information
    for field, key in (('Volume', 'volume'), ('Issue', 'number')):
        if key in data:
            rec[prefix + field] = data.pop(key)
    if 'pages' in data:
        # Drop empty pieces so doubled or dangling hyphens collapse
        pages = '-'.join([s for s in data.pop('pages').split('-') if s])
        rec[prefix + 'Pages'] = pages
        rec[prefix + 'IssuePages'] = pages
    # Publication date
    century = None
    year = u''
    if 'year' in data:
        year = data.pop('year')
        # HACK: Part 1 of fix for dates before 1900. The year is moved
        # into the 1900s before it is parsed and formatted.
        if int(year[:2]) < 19:
            century = year[:2]
            year = '19' + year[2:]
    month = data.pop('month', u'')
    date = parse(' '.join([month, year]))
    if date is not None:
        if month:
            nominal_date = actual_date = date.strftime('%b %Y')
            #actual_date = date.strftime('%m-%Y')
        else:
            nominal_date = actual_date = year
        # HACK: Part 2 of fix for dates before 1900. Restores the
        # original century.
        if century is not None:
            nominal_date = nominal_date.replace('19', century, 1)
            actual_date = actual_date.replace('19', century, 1)
        rec[prefix + 'PublicationDates'] = nominal_date
        rec[prefix + 'PublicationDate'] = actual_date
    # DOI
    doi = None
    if 'doi' in data:
        doi = clean_doi(data.pop('doi'))
        rec['AdmGUIDValue_tab'] = [doi]
        rec['AdmGUIDType_tab'] = ['DOI']
    # Source. The first of book title/journal found becomes the parent.
    publisher = None
    parent = clone()
    for source in ('Book', 'Journal'):
        try:
            source_title = data.pop(SOURCES[source])
        except KeyError:
            continue
        source_kind = source[:3]
        parent['BibRecordType'] = source
        parent[source_kind + 'Title'] = source_title
        if 'publisher' in data:
            publisher = data.pop('publisher')
            parent[source_kind + 'PublishedByRef'] = clone({
                'NamPartyType' : 'Organization',
                'NamInstitution': '',
                'NamOrganisation' : publisher
            })
        rec[prefix + 'ParentRef'] = parent.expand()
        break
    else:
        pp.pprint(data)
        raise Exception('Unrecognized parent publication')
    # Notes
    if 'url' in data:
        url = data.pop('url')
        today = datetime.now().strftime('%Y-%m-%d')
        rec['NotNotes'] = u'Data retrieved from {} on {}'.format(url, today)
    # Fields we're not interested in at present
    data.pop('link', None)
    # Multimedia. Attached only if a PDF named after the BibTeX ID can be
    # opened from the files directory.
    fp = os.path.abspath(os.path.join('files', data.pop('ID') + '.pdf'))
    try:
        # Opened only to test readability; closed again immediately
        with open(fp, 'rb'):
            pass
    except IOError:
        pass
    else:
        media = {
            'Multimedia': fp,
            'MulTitle': rec[prefix + 'Title'],
            'MulCreator_tab': ['Adam Mansur'],#fullnames,
            'DetResourceType': u'Publication/Manuscript',
            'DetCollectionName_tab': ['Documents and data (Mineral Sciences)'],
            'AdmPublishWebNoPassword': 'No',
            'AdmPublishWebPassword': 'No'
        }
        # Publisher and DOI are optional in the BibTeX, so only copy
        # them across when they were found above
        if publisher is not None:
            media['DetPublisher'] = publisher
        if doi is not None:
            media['AdmGUIDType_tab'] = ['DOI']
            media['AdmGUIDValue_tab'] = [doi]
        rec['MulMultiMediaRef_tab'] = [clone(media).expand()]
    rec.expand()
    # Look for keys that haven't been cross-walked to EMu schema
    if data:
        pp.pprint(data)
        raise Exception('Unhandled keys: {}'.format(sorted(data.keys())))
    #rec.pprint()
    return rec
def clone(*args):
    """Creates a BiblioRecord set up with the module-level FIELDS and MODULE

    Args:
        *args: passed through unchanged to BiblioRecord

    Returns:
        The new BiblioRecord
    """
    record = BiblioRecord(*args)
    record.fields = FIELDS
    record.module = MODULE
    return record
def clean_doi(doi):
    """Strips anything that precedes the "10." prefix of a DOI

    Prints a warning if the value does not start with "10.".

    Args:
        doi (str): a DOI, possibly preceded by a label or resolver URL

    Returns:
        The DOI starting from the first "10.". The value is returned
        unchanged if it contains no "10." at all.
    """
    prefix = '10.'
    if not doi.startswith(prefix):
        print('WARNING: DOI looks funny: {}'.format(doi))
        # Keep what follows the first prefix; only the leading junk
        # (e.g., "doi:" or a resolver URL) is dropped
        _, found, remainder = doi.partition(prefix)
        if found:
            doi = prefix + remainder
    return doi
def process_file(fp):
    """Create an EMu import file from a list of DOIs

    Reads a UTF-16 CSV with DOI, Filename, and IRN columns, resolves each
    DOI, writes the resulting records to import.xml, renames files in the
    files directory to match the BibTeX IDs, and writes the updated rows
    to doi.txt.

    Args:
        fp (str): the path to the list of DOIs
    """
    records = []
    refs = []
    pending = []
    with io.open(fp, 'r', encoding='utf16') as f:
        for ref in csv.DictReader(f, delimiter=',', quotechar='"'):
            bib = parse_bibtex(doi2bib(ref['DOI']))
            # emuize() consumes the dict it is given, so hand it a copy
            rec = emuize(bib.copy())
            if rec is not None:
                records.append(rec)
            # Update filename to match the id from the BibTeX record
            old_name = ref['Filename']
            if old_name:
                ref['Filename'] = bib['ID'] + os.path.splitext(old_name)[1]
                src = os.path.join('files', old_name)
                dst = os.path.join('files', ref['Filename'])
                if src != dst:
                    pending.append((src, dst))
            refs.append(ref)
    for src, dst in pending:
        os.rename(src, dst)
    write('import.xml', records, 'ebibliography')
    # Update the DOI file, header row first
    keys = ['DOI', 'Filename', 'IRN']
    rows = [keys] + [[ref[key] for key in keys] for ref in refs]
    with open('doi.txt', 'wb') as f:
        writer = csv.writer(f, delimiter=',', quotechar='"')
        for row in rows:
            writer.writerow([s.encode('utf-8') for s in row])
    # Re-encode the DOI file to UTF-16-LE
    with open('doi.txt', 'rb') as f:
        text = f.read().decode('utf-8')
    with io.open('doi.txt', 'w', encoding='utf-16', newline='\n') as f:
        f.write(text)