# Source code for paper_firehose.core.apis.openalex_client

"""
OpenAlex API client for fetching paper abstracts.

OpenAlex is an open catalog of scholarly papers that provides metadata including
abstracts, sometimes in an inverted-index format that needs reconstruction.
"""

from __future__ import annotations

import json
from urllib.parse import quote
from typing import Optional, Dict, Any

import requests

from ..http_client import RetryableHTTPClient
from ..text_utils import strip_jats


def _reconstruct_openalex(ii: Dict[str, Any]) -> Optional[str]:
    """Reassemble OpenAlex's inverted-index abstract representation.

    Args:
        ii: Inverted index dictionary mapping words to position lists

    Returns:
        Reconstructed abstract text or None if reconstruction fails
    """
    try:
        idx_pairs = []
        max_pos = -1
        for word, positions in ii.items():
            for p in positions:
                if p > max_pos:
                    max_pos = p
                idx_pairs.append((p, word))
        if max_pos < 0:
            return None
        arr = [None] * (max_pos + 1)
        for pos, word in idx_pairs:
            arr[pos] = word
        return ' '.join(w for w in arr if w)
    except Exception:
        return None


def _abstract_from_payload(data: Any) -> Optional[str]:
    """Extract a plain-text abstract from a decoded OpenAlex work record.

    Prefers the ``abstract`` field (passed through ``strip_jats``) and falls
    back to rebuilding the text from ``abstract_inverted_index``.

    Args:
        data: Decoded JSON body of an OpenAlex work response

    Returns:
        Plain-text abstract, or None if the payload is not a JSON object or
        carries neither field.
    """
    if not isinstance(data, dict):
        # A JSON array/scalar has no ``.get``; treat it as "no abstract"
        # instead of letting an AttributeError escape to the caller.
        return None
    abs_txt = data.get('abstract')
    if abs_txt:
        return strip_jats(abs_txt)
    ii = data.get('abstract_inverted_index')
    if ii:
        return _reconstruct_openalex(ii)
    return None


def get_openalex_abstract(
    doi: str,
    *,
    mailto: str,
    session: Optional[requests.Session] = None,
) -> Optional[str]:
    """Fetch an abstract from OpenAlex by DOI, reconstructing when inverted-indexed.

    Args:
        doi: Digital Object Identifier to look up
        mailto: Contact email, sent to OpenAlex as the ``mailto`` query
            parameter
        session: Optional requests.Session for backward compatibility. When
            given, a single GET with a 15 second timeout is issued through
            it; otherwise a RetryableHTTPClient is used.

    Returns:
        Plain-text abstract or None if not available (empty DOI, 404,
        request failure, undecodable or unexpected response body, or no
        abstract in the record)
    """
    if not doi:
        return None

    url = f"https://api.openalex.org/works/https://doi.org/{quote(doi)}?mailto={quote(mailto)}"

    try:
        if session is not None:
            # Caller-supplied session: keep the original single-shot logic
            # for compatibility.
            r = session.get(url, timeout=15)
            if r.status_code == 404:
                return None
            r.raise_for_status()
        else:
            # Default path: RetryableHTTPClient for better retry logic.
            client = RetryableHTTPClient(rps=1.0, max_retries=3)
            r = client.get_with_retry(url)
            if r is None:  # 404 case
                return None
        return _abstract_from_payload(r.json())
    except (requests.RequestException, ValueError, KeyError):
        # ValueError covers json.JSONDecodeError as well as decode errors
        # raised by alternative JSON backends that only subclass ValueError.
        return None