Source code for paper_firehose.core.apis.crossref_client
"""
Crossref API client for fetching paper abstracts.
Crossref is a major DOI registration agency with comprehensive metadata
including abstracts for academic publications.
"""
from __future__ import annotations
import json
import time
from urllib.parse import quote
from typing import Optional
import requests
from ..http_client import RetryableHTTPClient
from ..text_utils import strip_jats
# Base URL of the Crossref REST API "works" route. The trailing slash matters:
# get_crossref_abstract appends the URL-quoted DOI directly after it.
CROSSREF_API = "https://api.crossref.org/works/"
[docs]
def _abstract_from_work(payload: object) -> Optional[str]:
    """Extract the plain-text abstract from a decoded single-work response.

    Returns None rather than raising when the payload is not a JSON object,
    when ``message`` is missing or null, or when no abstract is present.
    """
    message = payload.get("message") if isinstance(payload, dict) else None
    abstract = message.get("abstract") if isinstance(message, dict) else None
    if not abstract:
        return None
    # An abstract that is empty after stripping is reported as "no abstract".
    return strip_jats(abstract) or None


def get_crossref_abstract(
    doi: str,
    *,
    mailto: str,
    max_retries: int = 3,
    session: Optional[requests.Session] = None,
) -> Optional[str]:
    """Return the plain-text abstract for a DOI, or None if not available.

    The contact address is sent to Crossref both as the ``mailto`` query
    parameter and inside the User-Agent header.

    When ``session`` is given, the request is made through it with a local
    retry loop: 429/5xx responses and request/decoding errors are retried
    with exponential backoff (capped at 8 seconds), and a numeric
    ``Retry-After`` header is honored when present. Otherwise the request is
    delegated to ``RetryableHTTPClient``.

    Args:
        doi: Digital Object Identifier to look up. Surrounding whitespace is
            ignored; an empty DOI yields None without any request.
        mailto: Contact email sent to Crossref (query parameter and
            User-Agent).
        max_retries: Maximum number of attempts on the session path; passed
            through to ``RetryableHTTPClient`` otherwise (default: 3).
        session: Optional requests.Session for backward compatibility.

    Returns:
        Plain-text abstract, or None if it is missing or could not be
        fetched.
    """
    # Without a DOI the URL would address the generic works listing, which
    # never carries a single-work abstract, so skip the round trip.
    doi = doi.strip() if doi else ""
    if not doi:
        return None

    url = f"{CROSSREF_API}{quote(doi)}?mailto={quote(mailto)}"
    headers = {
        "User-Agent": f"paper-firehose/abstract-fetcher (mailto:{mailto})"
    }

    if session is None:
        # Use RetryableHTTPClient for the retry logic.
        try:
            client = RetryableHTTPClient(rps=1.0, max_retries=max_retries)
            response = client.get_with_retry(url, headers=headers)
            if response is None:  # 404 case
                return None
            return _abstract_from_work(response.json())
        # ValueError covers every JSON decoding error Response.json() may
        # raise, including the non-stdlib one from older requests/simplejson.
        except (requests.RequestException, ValueError, KeyError):
            return None

    # A session was provided: keep the legacy in-function retry loop.
    for attempt in range(max_retries):
        backoff = min(8.0, 2.0 ** attempt)
        try:
            response = session.get(url, headers=headers, timeout=15)
            if response.status_code == 404:
                return None
            if response.status_code in (429, 500, 502, 503, 504):
                retry_after = response.headers.get("Retry-After")
                if retry_after:
                    try:
                        wait = float(retry_after)
                    except (ValueError, TypeError):
                        # e.g. the HTTP-date form of Retry-After
                        wait = 1.0
                    # Zero, negative, NaN and infinite values are unusable as
                    # a sleep duration; fall back to one second.
                    if not 0 < wait < float("inf"):
                        wait = 1.0
                else:
                    wait = backoff
            else:
                response.raise_for_status()
                return _abstract_from_work(response.json())
        except (requests.RequestException, ValueError, KeyError):
            wait = backoff
        # Sleeping after the final attempt would only delay the None result.
        if attempt < max_retries - 1:
            time.sleep(wait)
    return None
[docs]
def _abstract_from_search(payload: object) -> Optional[str]:
    """Extract the first hit's plain-text abstract from a decoded search response.

    Returns None rather than raising when the payload is not a JSON object,
    when ``message`` or ``items`` is missing, null or empty, or when the
    first item is not an object or has no abstract.
    """
    message = payload.get("message") if isinstance(payload, dict) else None
    items = message.get("items") if isinstance(message, dict) else None
    if not isinstance(items, list) or not items:
        return None
    first = items[0]
    abstract = first.get("abstract") if isinstance(first, dict) else None
    if not abstract:
        return None
    # An abstract that is empty after stripping is reported as "no abstract".
    return strip_jats(abstract) or None


def search_crossref_abstract_by_title(
    title: str,
    *,
    mailto: str,
    max_retries: int = 2,
    session: Optional[requests.Session] = None,
) -> Optional[str]:
    """Best-effort abstract lookup by title when the DOI is missing or has no abstract.

    Queries the Crossref works endpoint with ``query.bibliographic`` and
    ``rows=1`` and returns the first item's abstract if it has one. The
    contact address is sent both as the ``mailto`` query parameter and inside
    the User-Agent header.

    When ``session`` is given, the request is made through it with a local
    retry loop: 429/5xx responses and request/decoding errors are retried
    with exponential backoff (capped at 8 seconds), and a numeric
    ``Retry-After`` header is honored when present. Otherwise the request is
    delegated to ``RetryableHTTPClient``.

    Args:
        title: Paper title to search for. Surrounding whitespace is ignored;
            an empty title yields None without any request.
        mailto: Contact email sent to Crossref (query parameter and
            User-Agent).
        max_retries: Maximum number of attempts on the session path; passed
            through to ``RetryableHTTPClient`` otherwise (default: 2).
        session: Optional requests.Session for backward compatibility.

    Returns:
        Plain-text abstract, or None if it is missing or could not be
        fetched.
    """
    # A blank query would match arbitrary records, so refuse it up front.
    title = title.strip() if title else ""
    if not title:
        return None

    base = "https://api.crossref.org/works"
    params = f"?query.bibliographic={quote(title)}&rows=1&mailto={quote(mailto)}"
    url = base + params
    headers = {
        "User-Agent": f"paper-firehose/abstract-fetcher (mailto:{mailto})"
    }

    if session is None:
        # Use RetryableHTTPClient for the retry logic.
        try:
            client = RetryableHTTPClient(rps=1.0, max_retries=max_retries)
            response = client.get_with_retry(url, headers=headers)
            if response is None:  # 404 case
                return None
            return _abstract_from_search(response.json())
        # ValueError covers every JSON decoding error Response.json() may
        # raise, including the non-stdlib one from older requests/simplejson.
        except (requests.RequestException, ValueError, KeyError):
            return None

    # A session was provided: keep the legacy in-function retry loop.
    for attempt in range(max_retries):
        backoff = min(8.0, 2.0 ** attempt)
        try:
            response = session.get(url, headers=headers, timeout=15)
            if response.status_code == 404:
                return None
            if response.status_code in (429, 500, 502, 503, 504):
                retry_after = response.headers.get("Retry-After")
                if retry_after:
                    try:
                        wait = float(retry_after)
                    except (ValueError, TypeError):
                        # e.g. the HTTP-date form of Retry-After
                        wait = 1.0
                    # Zero, negative, NaN and infinite values are unusable as
                    # a sleep duration; fall back to one second.
                    if not 0 < wait < float("inf"):
                        wait = 1.0
                else:
                    wait = backoff
            else:
                response.raise_for_status()
                return _abstract_from_search(response.json())
        except (requests.RequestException, ValueError, KeyError):
            wait = backoff
        # Sleeping after the final attempt would only delay the None result.
        if attempt < max_retries - 1:
            time.sleep(wait)
    return None