# Source code for paper_firehose.core.apis.pubmed_client
"""
PubMed API client for fetching paper abstracts.
PubMed (NCBI) provides access to biomedical literature abstracts through the
E-utilities API (ESearch and EFetch).
"""
from __future__ import annotations
from typing import Optional
import xml.etree.ElementTree as ET
import requests
from ..http_client import RetryableHTTPClient
from ..text_utils import strip_jats
# [docs]
def get_pubmed_abstract_by_doi(
    doi: str,
    *,
    session: Optional[requests.Session] = None
) -> Optional[str]:
    """Look up a DOI in PubMed and return the combined abstract text if available.

    Uses ESearch to find the PMID for the DOI, then EFetch to retrieve the
    article XML, and joins every non-empty ``AbstractText`` element with a
    single space before passing the result through ``strip_jats``.

    Args:
        doi: Digital Object Identifier to look up. A falsy value short-circuits
            to ``None`` without any network traffic.
        session: Optional requests.Session kept for backward compatibility.
            When given, requests go through ``session.get`` with a 15 s
            timeout; otherwise a ``RetryableHTTPClient`` is created.

    Returns:
        Plain-text abstract, or ``None`` when the DOI is unknown to PubMed,
        the record has no non-empty abstract text, or a request/parse error
        occurs (this function is best-effort and does not raise for those).
    """
    if not doi:
        return None

    try:
        if session is not None:
            # Legacy path: caller-supplied session, explicit status check.
            def fetch(url, params):
                response = session.get(url, params=params, timeout=15)
                response.raise_for_status()
                return response
        else:
            # NOTE(review): NCBI documents a limit of 3 requests/second without
            # an API key, but rps=0.33 reads as one request every ~3 seconds.
            # The value is left unchanged; confirm what unit
            # RetryableHTTPClient expects for ``rps``.
            client = RetryableHTTPClient(rps=0.33, max_retries=3)

            def fetch(url, params):
                return client.get_with_retry(url, params=params)

        # ESearch for PMID by DOI
        search_response = fetch(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
            {"db": "pubmed", "term": f"{doi}[DOI]", "retmode": "json"},
        )
        # A None response is treated as "request failed" (retry client path).
        if search_response is None:
            return None
        pmid = _first_pmid(search_response.json())
        if pmid is None:
            return None

        # EFetch to get abstract XML
        fetch_response = fetch(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
            {"db": "pubmed", "id": pmid, "retmode": "xml"},
        )
        if fetch_response is None:
            return None

        abstract = _join_abstract_text(fetch_response.text)
        # Only hand real content to strip_jats; an empty abstract means
        # "not available".
        return strip_jats(abstract) if abstract else None
    except (requests.RequestException, ET.ParseError, ValueError, KeyError):
        # ValueError covers a non-JSON ESearch body (json.JSONDecodeError).
        return None


def _first_pmid(payload):
    """Return the first PMID in an ESearch JSON payload, or None if absent/malformed."""
    if not isinstance(payload, dict):
        return None
    result = payload.get('esearchresult')
    if not isinstance(result, dict):
        return None
    idlist = result.get('idlist')
    if isinstance(idlist, list) and idlist:
        return idlist[0]
    return None


def _join_abstract_text(xml_text):
    """Return the space-joined non-empty AbstractText contents of EFetch XML ('' if none)."""
    root = ET.fromstring(xml_text)
    sections = (
        ''.join(node.itertext()).strip()
        for node in root.findall('.//AbstractText')
    )
    return ' '.join(section for section in sections if section)