from __future__ import annotations
import logging
import os
from importlib.metadata import version as _get_version, PackageNotFoundError
from pathlib import Path
from typing import Any, Dict, List, Optional
# The package version is read from installed distribution metadata
# (declared in pyproject.toml). Start from the development placeholder and
# overwrite it when the metadata lookup succeeds.
__version__ = "0.0.0.dev"
try:
    __version__ = _get_version("paper_firehose")
except PackageNotFoundError:
    # No installed metadata (e.g. running from a source checkout):
    # keep the placeholder assigned above.
    pass
from .commands import filter as filter_cmd
from .commands import rank as rank_cmd
from .commands import abstracts as abstracts_cmd
from .commands import pqa_summary as pqa_summary_cmd
from .commands import email_list as email_cmd
from .commands import export_recent as export_recent_cmd
from .commands import query as query_cmd
from .core.config import ConfigManager, DEFAULT_CONFIG_PATH
from .core.database import DatabaseManager
from .core.paths import resolve_data_path
from .processors.html_generator import HTMLGenerator
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# String form of the packaged default config path; every public function
# falls back to it when the caller does not supply ``config_path``.
_DEFAULT_CONFIG = str(DEFAULT_CONFIG_PATH)

# Names exported by ``from <package> import *``.
__all__ = [
    "__version__",
    # pipeline steps
    "filter",
    "rank",
    "abstracts",
    "pqa_summary",
    "paperqa_summary",
    "email",
    # maintenance / inspection
    "purge",
    "status",
    # HTML output
    "html",
    "generate_html",
    # history export and querying
    "export_recent",
    "query",
]
def _resolve_output_path(path: str) -> Path:
    """Turn an HTML output location into a concrete filesystem path.

    Relative paths are delegated to ``resolve_data_path`` under the ``'html'``
    segment with ``ensure_parent=True``. Absolute paths are returned as
    given, after making sure their parent directory exists.
    """
    target = Path(path)
    if not target.is_absolute():
        return resolve_data_path('html', *target.parts, ensure_parent=True)

    # Absolute path: honour it verbatim, only creating missing parents.
    target.parent.mkdir(parents=True, exist_ok=True)
    return target
[docs]
def filter(topic: Optional[str] = None, config_path: Optional[str] = None) -> None:
    """Execute the filter command from Python code.

    Args:
        topic: Name of a single topic to process. ``None`` is forwarded
            unchanged, which the command treats as "all topics".
        config_path: Location of the main YAML config. The packaged default
            is used when this is omitted or empty.
    """
    filter_cmd.run(config_path or _DEFAULT_CONFIG, topic)
[docs]
def rank(topic: Optional[str] = None, config_path: Optional[str] = None) -> None:
    """Run the rank command, which writes rank scores into papers.db.

    Args:
        topic: Topic to rank; ``None`` is forwarded so all topics are ranked.
        config_path: Location of the main YAML config. The packaged default
            is used when this is omitted or empty.
    """
    rank_cmd.run(config_path or _DEFAULT_CONFIG, topic)
[docs]
def abstracts(
    topic: Optional[str] = None,
    *,
    mailto: Optional[str] = None,
    limit: Optional[int] = None,
    rps: Optional[float] = None,
    config_path: Optional[str] = None,
) -> None:
    """Run the abstracts command for ranked entries.

    Args:
        topic: Limit the run to one topic (optional).
        mailto: Contact email used in the Crossref user agent (optional).
        limit: Upper bound on abstracts per topic; forwarded as
            ``max_per_topic`` (optional).
        rps: Requests-per-second throttle. A missing or falsy value
            (``None`` or ``0``) is replaced by ``1.0``.
        config_path: Location of the main YAML config. The packaged default
            is used when this is omitted or empty.
    """
    request_rate = rps or 1.0
    abstracts_cmd.run(
        config_path or _DEFAULT_CONFIG,
        topic,
        mailto=mailto,
        max_per_topic=limit,
        rps=request_rate,
    )
[docs]
def pqa_summary(
    topic: Optional[str] = None,
    *,
    rps: Optional[float] = None,
    limit: Optional[int] = None,
    arxiv: Optional[List[str]] = None,
    entry_ids: Optional[List[str]] = None,
    use_history: bool = False,
    history_date: Optional[str] = None,
    history_feed_like: Optional[str] = None,
    config_path: Optional[str] = None,
) -> None:
    """Invoke the paper-qa summary command (PDF download + grounded summaries).

    All selection options are forwarded unchanged to the command.

    Args:
        topic: Topic whose ranked entries should be summarised. When omitted
            and no IDs are given, all configured topics are scanned.
        rps: Requests-per-second override for arXiv lookups/downloads.
        limit: Cap on the number of ranked entries per topic.
        arxiv: arXiv IDs/URLs to process directly, bypassing ranking.
        entry_ids: Database entry IDs to summarise (history lookup).
        use_history: When True, resolve ``entry_ids`` against the history
            database.
        history_date: YYYY-MM-DD filter applied when querying history records.
        history_feed_like: Substring filter on history feed names.
        config_path: Location of the main YAML config. The packaged default
            is used when this is omitted or empty.
    """
    selection = {
        'rps': rps,
        'limit': limit,
        'arxiv': arxiv,
        'entry_ids': entry_ids,
        'use_history': use_history,
        'history_date': history_date,
        'history_feed_like': history_feed_like,
    }
    pqa_summary_cmd.run(config_path or _DEFAULT_CONFIG, topic, **selection)
[docs]
def email(
    topic: Optional[str] = None,
    *,
    mode: str = 'auto',
    limit: Optional[int] = None,
    recipients_file: Optional[str] = None,
    dry_run: bool = False,
    config_path: Optional[str] = None,
) -> None:
    """Run the email command, which sends a digest built from papers.db via SMTP.

    Args:
        topic: Topic to include; forwarded unchanged (``None`` by default).
        mode: Mode string handed to the email command (default ``'auto'``).
        limit: Optional limit forwarded to the email command.
        recipients_file: Optional recipients file path forwarded to the command.
        dry_run: Forwarded as-is to the command's ``dry_run`` flag.
        config_path: Location of the main YAML config. The packaged default
            is used when this is omitted or empty.
    """
    delivery_options = {
        'mode': mode,
        'limit': limit,
        'dry_run': dry_run,
        'recipients_file': recipients_file,
    }
    email_cmd.run(config_path or _DEFAULT_CONFIG, topic, **delivery_options)
[docs]
def export_recent(
    days: int = 60,
    output_name: Optional[str] = None,
    config_path: Optional[str] = None,
) -> None:
    """Export recent rows of matched_entries_history.db into a smaller database.

    The reduced database holds only entries from the last ``days`` days so the
    history viewer HTML can load faster initially.

    Args:
        days: Size of the window in days (default: 60).
        output_name: Output filename. ``None`` is forwarded so the command
            applies its own default (documented as
            matched_entries_history.recent.db).
        config_path: Location of the main YAML config. The packaged default
            is used when this is omitted or empty.
    """
    export_recent_cmd.run(config_path or _DEFAULT_CONFIG, days, output_name)
[docs]
def query(
    *,
    history: bool = False,
    all_feeds: bool = False,
    topic: Optional[str] = None,
    min_rank: Optional[float] = None,
    since: Optional[str] = None,
    until: Optional[str] = None,
    search: Optional[str] = None,
    status: Optional[str] = None,
    has_doi: bool = False,
    has_abstract: bool = False,
    sort: str = 'rank',
    limit: int = 20,
    offset: int = 0,
    json: bool = False,
    count: bool = False,
    fields: Optional[str] = None,
    config_path: Optional[str] = None,
) -> None:
    """Run the query command against one of the paper databases and print results.

    Args:
        history: Target matched_entries_history.db instead of papers.db.
        all_feeds: Target all_feed_entries.db instead of papers.db.
        topic: Restrict to one topic name.
        min_rank: Lower bound on rank_score.
        since: Published on or after this date (YYYY-MM-DD).
        until: Published on or before this date (YYYY-MM-DD).
        search: Case-insensitive text search over title and abstract.
        status: Restrict by entry status (current DB only).
        has_doi: Keep only entries that have a DOI.
        has_abstract: Keep only entries that have an abstract.
        sort: Sort key: 'rank', 'date', or 'title'.
        limit: Maximum number of results (0 = unlimited).
        offset: Number of leading results to skip.
        json: Emit JSON output (forwarded as ``output_json``).
        count: Print only the count (forwarded as ``count_only``).
        fields: Comma-separated column names to include.
        config_path: Location of the main YAML config. The packaged default
            is used when this is omitted or empty.

    Raises:
        ValueError: If both ``history`` and ``all_feeds`` are set.
    """
    if history and all_feeds:
        raise ValueError("Cannot use both history and all_feeds")

    # Exactly one database is selected; papers.db ('current') is the default.
    if history:
        db_key = 'history'
    elif all_feeds:
        db_key = 'all_feeds'
    else:
        db_key = 'current'

    query_cmd.run(
        config_path or _DEFAULT_CONFIG,
        db_key=db_key,
        topic=topic,
        min_rank=min_rank,
        status=status,
        has_doi=has_doi,
        has_abstract=has_abstract,
        since=since,
        until=until,
        search=search,
        sort=sort,
        limit=limit,
        offset=offset,
        output_json=json,
        count_only=count,
        fields=fields,
    )
[docs]
def purge(days: Optional[int] = None, all_data: bool = False, config_path: Optional[str] = None) -> None:
    """Remove entries from the databases via the filter command's purge.

    Args:
        days: When given, removes entries whose published_date falls within
            the most recent N days (including today) across all databases.
        all_data: If True, clears all databases and reinitializes schemas.
        config_path: Location of the main YAML config. The packaged default
            is used when this is omitted or empty.

    Raises:
        ValueError: If neither ``days`` nor ``all_data`` is supplied.
    """
    nothing_requested = days is None and not all_data
    if nothing_requested:
        raise ValueError("Specify days or all_data=True")

    filter_cmd.purge(config_path or _DEFAULT_CONFIG, days, all_data)
[docs]
def status(config_path: Optional[str] = None) -> Dict[str, Any]:
    """Collect configuration status as a plain dictionary.

    The result always contains ``'config_path'`` and ``'valid'``. On failure
    (missing file, or any exception raised while inspecting the config) it
    also carries ``'error'``. On success it carries ``'topics'``,
    ``'enabled_feeds_count'`` and ``'db_paths'``.

    Args:
        config_path: Location of the main YAML config. The packaged default
            is used when this is omitted or empty.

    Returns:
        The status dictionary described above. Errors are reported in the
        dictionary rather than raised.
    """
    cfg_path = config_path or _DEFAULT_CONFIG
    report: Dict[str, Any] = {'config_path': cfg_path}

    if not os.path.exists(cfg_path):
        report['valid'] = False
        report['error'] = f'Config file not found: {cfg_path}'
        return report

    try:
        manager = ConfigManager(cfg_path)
        is_valid = manager.validate_config()
        topic_names = manager.get_available_topics()
        # Enabled feeds are only looked up for a configuration that validated.
        enabled = manager.get_enabled_feeds() if is_valid else {}
        loaded = manager.load_config()

        db_section = loaded.get('database', {}) if isinstance(loaded, dict) else {}
        feed_count = len(enabled) if isinstance(enabled, dict) else 0
        report.update(
            valid=bool(is_valid),
            topics=topic_names,
            enabled_feeds_count=feed_count,
            db_paths=db_section,
        )
    except Exception as err:
        # Any failure is reported to the caller instead of propagating.
        report['valid'] = False
        report['error'] = str(err)
    return report
[docs]
def html(
    topic: Optional[str] = None,
    output_path: Optional[str] = None,
    config_path: Optional[str] = None,
) -> None:
    """Generate HTML for one or all topics directly from papers.db.

    For every rendered topic two pages are produced: the regular page and a
    ranked page. A failure while producing the ranked page is logged and does
    not abort the run; a failure while producing the regular page propagates.

    Args:
        topic: Optional topic name. When omitted, HTML is produced for all
            topics defined in the configuration.
        output_path: Optional output path. Only valid when *topic* is
            provided; when generating all topics the configured filenames
            are used.
        config_path: Path to main YAML config; defaults to the packaged
            config.

    Raises:
        ValueError: If ``output_path`` is given without ``topic``, if the
            configuration does not validate, or if no topics are available.
    """
    # Reject inconsistent arguments before touching config or databases.
    if output_path and not topic:
        raise ValueError("output_path can only be provided when generating a single topic")

    cfg_path = config_path or _DEFAULT_CONFIG
    config_manager = ConfigManager(cfg_path)
    if not config_manager.validate_config():
        raise ValueError(f"Invalid configuration at {cfg_path}")

    config = config_manager.load_config()
    db_manager = DatabaseManager(config)
    # Everything after the manager is created runs under try/finally so the
    # connections are released even if topic discovery or template loading
    # fails, not only when rendering fails.
    try:
        topics_to_render = [topic] if topic else config_manager.get_available_topics()
        if not topics_to_render:
            raise ValueError("No topics available in configuration")

        base_generator = HTMLGenerator()
        ranked_generator = HTMLGenerator(template_path='ranked_template.html')

        for topic_name in topics_to_render:
            topic_config = config_manager.load_topic_config(topic_name)
            output_config = topic_config.get('output', {})
            # An explicit output_path only applies to a single-topic run;
            # otherwise use the configured filename or the default pattern.
            topic_output_path = (
                output_path
                if topic and output_path
                else output_config.get('filename', f'{topic_name}_filtered_articles.html')
            )
            heading = topic_config['name']
            description = topic_config.get('description')

            output_target = _resolve_output_path(topic_output_path)
            base_generator.generate_html_from_database(
                db_manager,
                topic_name,
                str(output_target),
                heading,
                description,
            )

            ranked_output_path = (
                output_config.get('filename_ranked') or f'results_{topic_name}_ranked.html'
            )
            try:
                ranked_target = _resolve_output_path(ranked_output_path)
                ranked_generator.generate_ranked_html_from_database(
                    db_manager,
                    topic_name,
                    str(ranked_target),
                    heading,
                    description,
                )
            except Exception as exc:
                # Best effort: the ranked page is optional, so log and move on
                # to the next topic.
                logger.error("Failed to generate ranked HTML for topic '%s': %s", topic_name, exc)
    finally:
        db_manager.close_all_connections()
# Deprecated aliases, kept so code written against the older function names
# keeps working. Each alias is the very same function object as its target.
generate_html = html
paperqa_summary = pqa_summary