Source code for paper_firehose.core.http_client
"""Shared HTTP client with retry logic and rate limiting."""
import math
import time
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from typing import Any, Dict, Optional

import requests
[docs]
class RetryableHTTPClient:
    """HTTP client with exponential backoff retry logic and rate limiting.

    Retries throttling/server-error responses (429, 500, 502, 503, 504) and
    network-level failures with exponential backoff, honours ``Retry-After``
    headers (both delta-seconds and HTTP-date forms), and spaces requests out
    so that at most ``rps`` requests per second are issued.

    Args:
        rps: Maximum requests per second (default: 1.0). Values below 0.01
            are clamped to 0.01 when computing the minimum interval.
        max_retries: Maximum number of attempts made per request, including
            the first one (default: 3). Values below 1 are treated as 1.
        timeout: Request timeout in seconds (default: 15)
    """

    # Status codes that are worth retrying after a pause.
    RETRYABLE_STATUS_CODES = frozenset({429, 500, 502, 503, 504})
    # Upper bound for the exponential backoff schedule (1s, 2s, 4s, 8s, 8s, ...).
    MAX_BACKOFF_SECONDS = 8.0

    def __init__(self, rps: float = 1.0, max_retries: int = 3, timeout: int = 15):
        self.session = requests.Session()
        self.rps = rps
        self.max_retries = max_retries
        self.timeout = timeout
        # Clamp rps so a zero/negative value cannot cause a division by zero.
        self.min_interval = 1.0 / max(rps, 0.01)
        # Monotonic-clock timestamp of the previous request; -inf means
        # "no request yet", so the first call never has to wait.
        self.last_request_time = float("-inf")

    def _rate_limit(self) -> None:
        """Sleep as needed so consecutive requests are ``min_interval`` apart."""
        # A monotonic clock is immune to wall-clock adjustments (NTP, DST).
        elapsed = time.monotonic() - self.last_request_time
        # A negative elapsed time means the stored timestamp is not from this
        # clock; skip the wait instead of sleeping for an absurd duration.
        if 0 <= elapsed < self.min_interval:
            time.sleep(self.min_interval - elapsed)
        self.last_request_time = time.monotonic()

    def get_with_retry(
        self,
        url: str,
        headers: Optional[Dict[str, str]] = None,
        params: Optional[Dict[str, Any]] = None,
        timeout: Optional[int] = None,
        return_none_on_404: bool = True
    ) -> "Optional[requests.Response]":
        """Make GET request with exponential backoff retry logic.

        Args:
            url: URL to fetch
            headers: Optional request headers
            params: Optional query parameters
            timeout: Optional timeout override (uses instance default if None)
            return_none_on_404: If True, return None on 404; if False, let it raise

        Returns:
            Response object on success, None on 404 (if return_none_on_404=True)

        Raises:
            requests.HTTPError: Immediately on non-retryable HTTP errors, or
                when a retryable status (429/5xx) persists on the final attempt
            requests.RequestException: On network errors after retries exhausted
        """
        timeout = timeout or self.timeout
        # Always make at least one attempt, even if max_retries is 0 or negative.
        attempts = max(1, self.max_retries)

        for attempt in range(attempts):
            is_last_attempt = attempt == attempts - 1
            self._rate_limit()

            # Only the network call is guarded: HTTP status handling below must
            # not be swallowed and retried by this handler.
            try:
                response = self.session.get(
                    url, headers=headers, params=params, timeout=timeout
                )
            except requests.RequestException:
                if is_last_attempt:
                    raise
                time.sleep(self._exponential_backoff(attempt))
                continue

            if response.status_code == 404 and return_none_on_404:
                return None

            # Retry on throttling/server errors while attempts remain; on the
            # final attempt fall through so raise_for_status() reports it.
            if response.status_code in self.RETRYABLE_STATUS_CODES and not is_last_attempt:
                time.sleep(self._calculate_backoff_time(response, attempt))
                continue

            # Raises requests.HTTPError for any remaining 4xx/5xx status.
            response.raise_for_status()
            return response

        # Unreachable: every iteration returns, raises, or continues to a later one.
        return None

    def _exponential_backoff(self, attempt: int) -> float:
        """Return the backoff delay for a zero-based attempt: 1s, 2s, 4s, max 8s."""
        # Clamp the exponent so a very large attempt index cannot overflow.
        return min(self.MAX_BACKOFF_SECONDS, 2.0 ** min(attempt, 10))

    @staticmethod
    def _seconds_until_http_date(value: Any) -> Optional[float]:
        """Return seconds from now until an HTTP-date, or None if unparsable."""
        if not isinstance(value, str):
            return None
        try:
            when = parsedate_to_datetime(value)
        except (TypeError, ValueError):
            # Older Pythons raise TypeError, newer ones ValueError, on bad input.
            return None
        if when.tzinfo is None:
            # HTTP-dates are always GMT; treat a naive result as UTC.
            when = when.replace(tzinfo=timezone.utc)
        return (when - datetime.now(timezone.utc)).total_seconds()

    def _calculate_backoff_time(self, response: "requests.Response", attempt: int) -> float:
        """Calculate backoff time, respecting Retry-After header if present.

        ``Retry-After`` may be a number of seconds or an HTTP-date. A usable
        value is honoured with a floor of one second; a missing, unparsable or
        non-finite value falls back to exponential backoff.
        """
        fallback = self._exponential_backoff(attempt)
        retry_after = response.headers.get("Retry-After")
        if not retry_after:
            return fallback

        try:
            wait = float(retry_after)
        except (ValueError, TypeError):
            wait = self._seconds_until_http_date(retry_after)

        # nan/inf would make time.sleep() raise, so treat them as unusable.
        if wait is None or not math.isfinite(wait):
            return fallback
        return max(wait, 1.0)  # At least 1 second

    def close(self):
        """Close the underlying session."""
        self.session.close()

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
        self.close()