# Source code for paper_firehose.core.apis.crossref_client

"""
Crossref API client for fetching paper abstracts.

Crossref is a major DOI registration agency with comprehensive metadata
including abstracts for academic publications.
"""

from __future__ import annotations

import json
import time
from urllib.parse import quote
from typing import Optional

import requests

from ..http_client import RetryableHTTPClient
from ..text_utils import strip_jats


# Base URL for single-work lookups on the Crossref REST API; the
# percent-encoded DOI is appended directly after the trailing slash.
CROSSREF_API = "https://api.crossref.org/works/"



# [docs]
def _abstract_from_work_payload(data: object) -> Optional[str]:
    """Pull the plain-text abstract out of a Crossref single-work JSON payload.

    Returns None when the payload is not a JSON object, when its ``message``
    member is missing/null/not an object, or when there is no abstract.
    """
    if not isinstance(data, dict):
        return None
    message = data.get("message")
    if not isinstance(message, dict):
        return None
    abstract = message.get("abstract")
    if not abstract:
        return None
    # strip_jats may reduce markup-only abstracts to an empty string.
    return strip_jats(abstract) or None


def get_crossref_abstract(
    doi: str,
    *,
    mailto: str,
    max_retries: int = 3,
    session: Optional[requests.Session] = None
) -> Optional[str]:
    """Return the plain-text abstract for DOI or None if not available.

    The contact address is sent both as the ``mailto`` query parameter and
    inside the User-Agent header.

    When ``session`` is given, the request is issued through it with an
    in-function retry loop: 429/500/502/503/504 responses are retried,
    waiting for the numeric ``Retry-After`` value when present and otherwise
    for an exponential backoff capped at 8 seconds. No wait is performed
    after the final attempt. Without a session the request is delegated to
    ``RetryableHTTPClient``.

    Args:
        doi: Digital Object Identifier to look up
        mailto: Contact email sent to Crossref (query parameter and User-Agent)
        max_retries: Maximum number of attempts (default: 3)
        session: Optional requests.Session for backward compatibility

    Returns:
        Plain-text abstract or None if not available
    """
    if not doi:
        # An empty DOI would hit the collection endpoint, not a single work.
        return None

    url = f"{CROSSREF_API}{quote(doi)}?mailto={quote(mailto)}"
    headers = {
        "User-Agent": f"paper-firehose/abstract-fetcher (mailto:{mailto})"
    }

    if session is None:
        # Default path: let RetryableHTTPClient handle retries.
        try:
            client = RetryableHTTPClient(rps=1.0, max_retries=max_retries)
            r = client.get_with_retry(url, headers=headers)
            if r is None:  # no response to parse (treated as the 404 case)
                return None
            return _abstract_from_work_payload(r.json())
        except (requests.RequestException, ValueError, KeyError):
            # ValueError also covers json.JSONDecodeError.
            return None

    # Compatibility path: caller-supplied session with a local retry loop.
    last_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            r = session.get(url, headers=headers, timeout=15)
            if r.status_code == 404:
                return None
            if r.status_code in (429, 500, 502, 503, 504):
                if attempt < last_attempt:
                    retry_after = r.headers.get("Retry-After")
                    if retry_after:
                        try:
                            wait = float(retry_after)
                        except (ValueError, TypeError):
                            # Non-numeric value; use a short pause.
                            wait = 1.0
                    else:
                        wait = min(8.0, 2.0 ** attempt)
                    time.sleep(wait if wait > 0 else 1.0)
                continue
            r.raise_for_status()
            return _abstract_from_work_payload(r.json())
        except (requests.RequestException, ValueError, KeyError):
            # Sleeping after the last attempt would only delay returning None.
            if attempt < last_attempt:
                time.sleep(min(8.0, 2.0 ** attempt))
    return None




# [docs]
def _abstract_from_search_payload(data: object) -> Optional[str]:
    """Pull the first hit's plain-text abstract out of a Crossref search payload.

    Returns None when the payload is not a JSON object, when ``message`` or
    ``items`` is missing/empty/of an unexpected type, or when the first item
    has no abstract.
    """
    if not isinstance(data, dict):
        return None
    message = data.get("message")
    if not isinstance(message, dict):
        return None
    items = message.get("items")
    if not isinstance(items, list) or not items:
        return None
    first = items[0]
    if not isinstance(first, dict):
        return None
    abstract = first.get("abstract")
    if not abstract:
        return None
    # strip_jats may reduce markup-only abstracts to an empty string.
    return strip_jats(abstract) or None


def search_crossref_abstract_by_title(
    title: str,
    *,
    mailto: str,
    max_retries: int = 2,
    session: Optional[requests.Session] = None
) -> Optional[str]:
    """Best-effort abstract lookup by title when DOI is missing or returns no abstract.

    Queries Crossref's works search endpoint with ``query.bibliographic`` and
    ``rows=1`` and returns the first item's abstract if it has one. The
    contact address is sent both as the ``mailto`` query parameter and inside
    the User-Agent header.

    When ``session`` is given, the request is issued through it with an
    in-function retry loop: 429/500/502/503/504 responses are retried,
    waiting for the numeric ``Retry-After`` value when present and otherwise
    for an exponential backoff capped at 8 seconds. No wait is performed
    after the final attempt. Without a session the request is delegated to
    ``RetryableHTTPClient``.

    Args:
        title: Paper title to search for
        mailto: Contact email sent to Crossref (query parameter and User-Agent)
        max_retries: Maximum number of attempts (default: 2)
        session: Optional requests.Session for backward compatibility

    Returns:
        Plain-text abstract or None if not available
    """
    if not title:
        return None

    url = (
        "https://api.crossref.org/works"
        f"?query.bibliographic={quote(title)}&rows=1&mailto={quote(mailto)}"
    )
    headers = {
        "User-Agent": f"paper-firehose/abstract-fetcher (mailto:{mailto})"
    }

    if session is None:
        # Default path: let RetryableHTTPClient handle retries.
        try:
            client = RetryableHTTPClient(rps=1.0, max_retries=max_retries)
            r = client.get_with_retry(url, headers=headers)
            if r is None:  # no response to parse (treated as the 404 case)
                return None
            return _abstract_from_search_payload(r.json())
        except (requests.RequestException, ValueError, KeyError):
            # ValueError also covers json.JSONDecodeError.
            return None

    # Compatibility path: caller-supplied session with a local retry loop.
    last_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            r = session.get(url, headers=headers, timeout=15)
            if r.status_code == 404:
                return None
            if r.status_code in (429, 500, 502, 503, 504):
                if attempt < last_attempt:
                    retry_after = r.headers.get("Retry-After")
                    if retry_after:
                        try:
                            wait = float(retry_after)
                        except (ValueError, TypeError):
                            # Non-numeric value; use a short pause.
                            wait = 1.0
                    else:
                        wait = min(8.0, 2.0 ** attempt)
                    time.sleep(wait if wait > 0 else 1.0)
                continue
            r.raise_for_status()
            return _abstract_from_search_payload(r.json())
        except (requests.RequestException, ValueError, KeyError):
            # Sleeping after the last attempt would only delay returning None.
            if attempt < last_attempt:
                time.sleep(min(8.0, 2.0 ** attempt))
    return None