Source code for paper_firehose.core.apis.pubmed_client

"""
PubMed API client for fetching paper abstracts.

PubMed (NCBI) provides access to biomedical literature abstracts through the
E-utilities API (ESearch and EFetch).
"""

from __future__ import annotations

from typing import Optional
import xml.etree.ElementTree as ET

import requests

from ..http_client import RetryableHTTPClient
from ..text_utils import strip_jats



[docs]
def get_pubmed_abstract_by_doi(
    doi: str,
    *,
    session: Optional[requests.Session] = None
) -> Optional[str]:
    """Look up a DOI in PubMed and return the combined abstract text if available.

    Uses ESearch to resolve the DOI to a PMID, then EFetch to retrieve the
    article XML, and joins the text of every ``AbstractText`` element with a
    single space before passing it through ``strip_jats``.

    Args:
        doi: Digital Object Identifier to look up. A falsy value (``""`` or
            ``None``) short-circuits to ``None`` without any network call.
        session: Optional ``requests.Session`` kept for backward
            compatibility. When given, requests go through ``session.get``
            with a 15 second timeout; otherwise a ``RetryableHTTPClient`` is
            created for this call.

    Returns:
        The abstract text, or ``None`` when the DOI is unknown to PubMed, the
        record carries no non-blank abstract text, or a request/parse error
        occurs (this lookup is best-effort and never raises for those).
    """
    if not doi:
        return None

    try:
        if session is not None:
            # Legacy path: the caller owns the session; no retry logic here.
            def fetch(url, params):
                response = session.get(url, params=params, timeout=15)
                response.raise_for_status()
                return response
        else:
            # NOTE(review): the previous comment here read "PubMed rate limit:
            # 3 req/sec", but rps=0.33 would be about one request every three
            # seconds if `rps` means requests-per-second -- confirm against
            # RetryableHTTPClient before changing the value.
            client = RetryableHTTPClient(rps=0.33, max_retries=3)

            def fetch(url, params):
                # May return None; _lookup_abstract treats that as "no result".
                return client.get_with_retry(url, params=params)

        return _lookup_abstract(doi, fetch)
    except (requests.RequestException, ET.ParseError, KeyError, ValueError):
        # ValueError covers a non-JSON ESearch body: depending on the
        # installed requests version, Response.json() may raise a plain
        # ValueError/json.JSONDecodeError instead of a RequestException.
        return None


_PUBMED_ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
_PUBMED_EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"


def _lookup_abstract(doi, fetch):
    """Run the ESearch -> EFetch pipeline using ``fetch(url, params)`` as transport."""
    search = fetch(
        _PUBMED_ESEARCH_URL,
        {"db": "pubmed", "term": f"{doi}[DOI]", "retmode": "json"},
    )
    if search is None:
        return None

    pmid = _first_pmid(search.json())
    if pmid is None:
        return None

    record = fetch(
        _PUBMED_EFETCH_URL,
        {"db": "pubmed", "id": pmid, "retmode": "xml"},
    )
    if record is None:
        return None

    return _abstract_from_xml(record.text)


def _first_pmid(payload):
    """Return the first PMID in an ESearch JSON payload, or None if there is none."""
    # Validate the shape explicitly so an unexpected payload (e.g. a JSON list
    # or string) yields "not found" instead of an AttributeError.
    if not isinstance(payload, dict):
        return None
    result = payload.get('esearchresult')
    if not isinstance(result, dict):
        return None
    idlist = result.get('idlist')
    if not isinstance(idlist, list) or not idlist:
        return None
    return idlist[0]


def _abstract_from_xml(xml_text):
    """Join the non-blank ``AbstractText`` sections of an EFetch XML document."""
    root = ET.fromstring(xml_text)
    # NOTE(review): './/AbstractText' matches at any depth, so sections nested
    # under elements other than the main abstract are included as well --
    # confirm that is intended.
    stripped = (
        ''.join(node.itertext()).strip()
        for node in root.iterfind('.//AbstractText')
    )
    sections = [text for text in stripped if text]
    # Filter BEFORE the emptiness check: a record whose AbstractText elements
    # are all blank has no abstract and must yield None, not strip_jats('').
    if not sections:
        return None
    return strip_jats(' '.join(sections))