Source code for paper_firehose.core.doi_utils
"""Unified DOI extraction utilities.
Consolidates DOI extraction logic from database.py and abstracts.py into a
single, well-tested implementation.
"""
import re
import json
from typing import Optional, Dict, Any
# DOI regex from Crossref guidelines (simplified).
# Shape: the literal "10.", a 4-9 digit registrant code, "/", then one or more
# suffix characters drawn from letters, digits and "-._;()/:".
# The pattern is unanchored, so it is meant to be used with search()/finditer()
# to locate a DOI embedded in a URL or in free text.
# NOTE: the suffix class includes ".", ";", ":" and ")", so a raw match can
# absorb punctuation that directly follows a DOI in prose.
# NOTE: re.IGNORECASE is redundant here because the class already lists both
# A-Z and a-z; it is harmless.
DOI_PATTERN = re.compile(r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+", re.IGNORECASE)
[docs]
def find_doi_in_text(text: Optional[str]) -> Optional[str]:
    """Search a text string for a DOI pattern.

    The search is unanchored, so prefixes such as ``doi:`` or
    ``https://doi.org/`` need no special handling. Punctuation that merely
    follows the DOI in running text (a sentence-ending ``.``, a trailing
    ``;`` or ``:``, a closing ``)`` that has no matching ``(`` inside the
    DOI) is trimmed from the result.

    Args:
        text: Text to search for DOI. Falsy values yield ``None``; other
            non-string values are converted with ``str()`` before searching.

    Returns:
        DOI string if found, None otherwise

    Examples:
        >>> find_doi_in_text("doi:10.1234/example")
        '10.1234/example'
        >>> find_doi_in_text("https://doi.org/10.1234/example")
        '10.1234/example'
        >>> find_doi_in_text("See (doi:10.1234/example).")
        '10.1234/example'
        >>> find_doi_in_text("no doi here") is None
        True
    """
    if not text:
        return None
    for match in DOI_PATTERN.finditer(str(text)):
        doi = _strip_trailing_punctuation(match.group(0))
        # Trimming can consume the entire suffix (e.g. "10.1234/." becomes
        # "10.1234/"); accept only candidates that are still a complete DOI
        # and otherwise keep scanning the rest of the text.
        if DOI_PATTERN.fullmatch(doi):
            return doi
    return None


def _strip_trailing_punctuation(doi: str) -> str:
    """Remove trailing characters that belong to the surrounding prose."""
    while doi:
        last = doi[-1]
        # A closing parenthesis is kept when it balances one inside the DOI,
        # e.g. "10.1016/S0140-6736(05)", and dropped when it is unmatched.
        unmatched_paren = last == ")" and doi.count(")") > doi.count("(")
        if last in ".;:" or unmatched_paren:
            doi = doi[:-1]
        else:
            break
    return doi
[docs]
def extract_doi_from_entry(entry: Dict[str, Any]) -> Optional[str]:
    """Return the first DOI found in a feed entry, or None.

    Candidate values are pulled from the entry lazily, in a fixed order, and
    each one is passed to ``find_doi_in_text``; the first hit wins.

    Args:
        entry: Feed entry dictionary (from feedparser or similar)

    Returns:
        DOI string if found, None otherwise

    Lookup order:
        1. Direct DOI fields (doi, dc_identifier variants, prism doi, guid)
        2. ``id`` and ``link``
        3. ``summary``, ``summary_detail['value']``, ``description``
        4. Each dict in the ``content`` list (``value``, else ``content``)
        5. Each item in the ``links`` list (``href`` for dicts, ``str()``
           of the item otherwise)
    """
    if not entry:
        return None

    scalar_fields = (
        # Priority 1: direct DOI fields
        'doi',
        'dc_identifier', 'dc:identifier', 'dc.identifier', 'dcIdentifier',
        'prism:doi', 'prism_doi',
        'guid',
        # Priority 2: ID and link fields
        'id', 'link',
        # Priority 3 (first part): summary text
        'summary',
    )

    def candidates():
        for field in scalar_fields:
            yield entry.get(field)

        # Priority 3 (continued): structured summary, then 'description',
        # which some feeds use instead of 'summary'.
        detail = entry.get('summary_detail') or {}
        if isinstance(detail, dict):
            yield detail.get('value')
        yield entry.get('description')

        # Priority 4: content arrays
        content_items = entry.get('content') or []
        if isinstance(content_items, list):
            for item in content_items:
                if isinstance(item, dict):
                    yield item.get('value') or item.get('content')

        # Priority 5: links arrays
        link_items = entry.get('links') or []
        if isinstance(link_items, list):
            for item in link_items:
                yield item.get('href') if isinstance(item, dict) else str(item)

    for candidate in candidates():
        found = find_doi_in_text(candidate)
        if found:
            return found
    return None
[docs]
def extract_doi_from_json(raw_json: Optional[str]) -> Optional[str]:
    """Extract DOI from a raw JSON string.

    Useful when dealing with stored feed entry JSON payloads. The payload is
    decoded and, when it is a JSON object, handed to
    ``extract_doi_from_entry``.

    Args:
        raw_json: JSON string containing feed entry data

    Returns:
        DOI string if found. None when the input is empty, cannot be decoded
        as JSON, or decodes to something other than a JSON object.
    """
    if not raw_json:
        return None
    try:
        entry = json.loads(raw_json)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError as well as the
        # UnicodeDecodeError raised for undecodable bytes input.
        return None
    # Valid JSON may be a list, string, number or boolean; those have no
    # entry fields to inspect and would break the dict-based lookup.
    if not isinstance(entry, dict):
        return None
    return extract_doi_from_entry(entry)