Source code for paper_firehose

from __future__ import annotations

import logging
import os
from importlib.metadata import version as _get_version, PackageNotFoundError
from pathlib import Path
from typing import Any, Dict, List, Optional

# Version from package metadata (defined in pyproject.toml).
# Looked up once at import time via importlib.metadata.
try:
    __version__ = _get_version("paper_firehose")
except PackageNotFoundError:
    # No distribution metadata could be found for "paper_firehose";
    # expose a placeholder so that `__version__` is always defined.
    __version__ = "0.0.0.dev"  # Fallback for editable installs without metadata

from .commands import filter as filter_cmd
from .commands import rank as rank_cmd
from .commands import abstracts as abstracts_cmd
from .commands import pqa_summary as pqa_summary_cmd
from .commands import email_list as email_cmd
from .commands import export_recent as export_recent_cmd
from .commands import query as query_cmd
from .core.config import ConfigManager, DEFAULT_CONFIG_PATH
from .core.database import DatabaseManager
from .core.paths import resolve_data_path
from .processors.html_generator import HTMLGenerator

# Module-level logger; used by html() to report ranked-page failures.
logger = logging.getLogger(__name__)

# String form of the default config path; every public function below falls
# back to this value when its `config_path` argument is empty or None.
_DEFAULT_CONFIG = str(DEFAULT_CONFIG_PATH)

# Public API of the package. 'paperqa_summary' and 'generate_html' are the
# deprecated aliases assigned at the bottom of this module.
__all__ = [
    '__version__',
    'filter',
    'rank',
    'abstracts',
    'pqa_summary',
    'paperqa_summary',
    'email',
    'purge',
    'status',
    'html',
    'generate_html',
    'export_recent',
    'query',
]


def _resolve_output_path(path: str) -> Path:
    """Return the location an HTML output file should be written to.

    A relative *path* is handed to ``resolve_data_path`` under the ``'html'``
    segment (with ``ensure_parent=True``). An absolute *path* is returned
    unchanged after its parent directory has been created if missing.
    """
    target = Path(path)
    if not target.is_absolute():
        # Relative names live below the runtime data directory.
        return resolve_data_path('html', *target.parts, ensure_parent=True)
    target.parent.mkdir(parents=True, exist_ok=True)
    return target



[docs]
def filter(topic: Optional[str] = None, config_path: Optional[str] = None) -> None:
    """Execute the filter command from Python code.

    Args:
        topic: Name of the single topic to process. ``None`` is forwarded
            unchanged, which the command treats as "all topics".
        config_path: Location of the main YAML config. The module default
            is used when this is empty or ``None``.
    """
    filter_cmd.run(config_path if config_path else _DEFAULT_CONFIG, topic)




[docs]
def rank(topic: Optional[str] = None, config_path: Optional[str] = None) -> None:
    """Run the rank command for one topic, or for all topics when *topic* is None.

    Args:
        topic: Optional topic name, forwarded unchanged to the command.
        config_path: Location of the main YAML config. The module default
            is used when this is empty or ``None``.
    """
    rank_cmd.run(config_path if config_path else _DEFAULT_CONFIG, topic)




[docs]
def abstracts(
    topic: Optional[str] = None,
    *,
    mailto: Optional[str] = None,
    limit: Optional[int] = None,
    rps: Optional[float] = None,
    config_path: Optional[str] = None,
) -> None:
    """Run the abstracts command to fetch abstracts for ranked entries.

    Args:
        topic: Restrict the run to a single topic (optional).
        mailto: Contact email for the Crossref user agent (optional).
        limit: Maximum abstracts per topic; forwarded as ``max_per_topic``.
        rps: Requests-per-second throttle. Any falsy value (``None`` or
            ``0``) is replaced by ``1.0``.
        config_path: Location of the main YAML config. The module default
            is used when this is empty or ``None``.
    """
    settings_file = config_path or _DEFAULT_CONFIG
    throttle = rps if rps else 1.0
    abstracts_cmd.run(
        settings_file,
        topic,
        mailto=mailto,
        max_per_topic=limit,
        rps=throttle,
    )




[docs]
def pqa_summary(
    topic: Optional[str] = None,
    *,
    rps: Optional[float] = None,
    limit: Optional[int] = None,
    arxiv: Optional[List[str]] = None,
    entry_ids: Optional[List[str]] = None,
    use_history: bool = False,
    history_date: Optional[str] = None,
    history_feed_like: Optional[str] = None,
    config_path: Optional[str] = None,
) -> None:
    """Invoke the paper-qa summary command with the given selection options.

    Every keyword option is forwarded to the command unchanged.

    Args:
        topic: Optional topic name whose ranked entries should be targeted;
            when omitted and no IDs are supplied, all configured topics
            are scanned.
        rps: Optional requests-per-second override for arXiv lookups/downloads.
        limit: Optional cap on the number of ranked entries per topic.
        arxiv: Optional list of arXiv IDs/URLs to process directly
            (bypasses ranking).
        entry_ids: Optional list of database entry IDs to summarize.
        use_history: When True, resolve ``entry_ids`` against the history
            database.
        history_date: Optional YYYY-MM-DD filter for history records.
        history_feed_like: Optional substring filter for history feed names.
        config_path: Location of the main YAML config. The module default
            is used when this is empty or ``None``.
    """
    options = {
        'rps': rps,
        'limit': limit,
        'arxiv': arxiv,
        'entry_ids': entry_ids,
        'use_history': use_history,
        'history_date': history_date,
        'history_feed_like': history_feed_like,
    }
    pqa_summary_cmd.run(config_path or _DEFAULT_CONFIG, topic, **options)




[docs]
def email(
    topic: Optional[str] = None,
    *,
    mode: str = 'auto',
    limit: Optional[int] = None,
    recipients_file: Optional[str] = None,
    dry_run: bool = False,
    config_path: Optional[str] = None,
) -> None:
    """Run the email command to send a digest built from papers.db over SMTP.

    Args:
        topic: Optional topic name, forwarded unchanged to the command.
        mode: Digest mode forwarded to the command (default ``'auto'``).
        limit: Optional limit forwarded to the command.
        recipients_file: Optional recipients file path forwarded to the command.
        dry_run: Forwarded to the command as its ``dry_run`` flag.
        config_path: Location of the main YAML config. The module default
            is used when this is empty or ``None``.
    """
    delivery_options = {
        'mode': mode,
        'limit': limit,
        'dry_run': dry_run,
        'recipients_file': recipients_file,
    }
    email_cmd.run(config_path or _DEFAULT_CONFIG, topic, **delivery_options)




[docs]
def export_recent(
    days: int = 60,
    output_name: Optional[str] = None,
    config_path: Optional[str] = None,
) -> None:
    """Export recent history entries into a smaller database.

    Delegates to the export_recent command, which writes a filtered copy of
    matched_entries_history.db holding only the last *days* days so the
    history viewer HTML can load faster initially.

    Args:
        days: Number of days to include (default: 60).
        output_name: Optional output filename
            (default: matched_entries_history.recent.db).
        config_path: Location of the main YAML config. The module default
            is used when this is empty or ``None``.
    """
    settings_file = config_path if config_path else _DEFAULT_CONFIG
    export_recent_cmd.run(settings_file, days, output_name)




[docs]
def query(
    *,
    history: bool = False,
    all_feeds: bool = False,
    topic: Optional[str] = None,
    min_rank: Optional[float] = None,
    since: Optional[str] = None,
    until: Optional[str] = None,
    search: Optional[str] = None,
    status: Optional[str] = None,
    has_doi: bool = False,
    has_abstract: bool = False,
    sort: str = 'rank',
    limit: int = 20,
    offset: int = 0,
    json: bool = False,
    count: bool = False,
    fields: Optional[str] = None,
    config_path: Optional[str] = None,
) -> None:
    """Run the query command against one of the paper databases.

    The target database is selected by the mutually exclusive *history* and
    *all_feeds* flags; with neither set, the current database is queried.

    Args:
        history: Query matched_entries_history.db instead of papers.db.
        all_feeds: Query all_feed_entries.db instead of papers.db.
        topic: Filter by topic name.
        min_rank: Minimum rank_score threshold.
        since: Published on or after this date (YYYY-MM-DD).
        until: Published on or before this date (YYYY-MM-DD).
        search: Case-insensitive text search on title and abstract.
        status: Filter by entry status (current DB only).
        has_doi: Only entries with a DOI.
        has_abstract: Only entries with an abstract.
        sort: Sort key: 'rank', 'date', or 'title'.
        limit: Max results (0 = unlimited).
        offset: Skip first N results.
        json: Output as JSON (forwarded as ``output_json``).
        count: Print count only (forwarded as ``count_only``).
        fields: Comma-separated column names to include.
        config_path: Location of the main YAML config. The module default
            is used when this is empty or ``None``.

    Raises:
        ValueError: If both *history* and *all_feeds* are set.
    """
    if history and all_feeds:
        raise ValueError("Cannot use both history and all_feeds")

    # Map the two boolean flags onto the database key the command expects.
    if history:
        target_db = 'history'
    elif all_feeds:
        target_db = 'all_feeds'
    else:
        target_db = 'current'

    filters = {
        'topic': topic,
        'min_rank': min_rank,
        'status': status,
        'has_doi': has_doi,
        'has_abstract': has_abstract,
        'since': since,
        'until': until,
        'search': search,
    }
    presentation = {
        'sort': sort,
        'limit': limit,
        'offset': offset,
        'output_json': json,
        'count_only': count,
        'fields': fields,
    }
    query_cmd.run(
        config_path or _DEFAULT_CONFIG,
        db_key=target_db,
        **filters,
        **presentation,
    )




[docs]
def purge(days: Optional[int] = None, all_data: bool = False, config_path: Optional[str] = None) -> None:
    """Remove entries from the databases via the filter command's purge.

    Args:
        days: When provided, removes entries whose published_date falls
            within the most recent N days (including today) across all
            databases.
        all_data: If True, clears all databases and reinitializes schemas.
        config_path: Location of the main YAML config. The module default
            is used when this is empty or ``None``.

    Raises:
        ValueError: If neither *days* nor *all_data* selects anything.
    """
    something_selected = days is not None or all_data
    if not something_selected:
        raise ValueError("Specify days or all_data=True")
    filter_cmd.purge(config_path or _DEFAULT_CONFIG, days, all_data)




[docs]
def status(config_path: Optional[str] = None) -> Dict[str, Any]:
    """Build a status report describing the configuration.

    The function never raises for configuration problems: a missing file or
    any exception raised while inspecting the config is reported through
    the ``'valid'`` and ``'error'`` keys instead.

    Args:
        config_path: Location of the main YAML config. The module default
            is used when this is empty or ``None``.

    Returns:
        A dict that always has ``'config_path'`` and ``'valid'``. On success
        it also has ``'topics'``, ``'enabled_feeds_count'`` and
        ``'db_paths'``; on failure it has ``'error'``.
    """
    cfg_path = config_path or _DEFAULT_CONFIG
    report: Dict[str, Any] = {'config_path': cfg_path}

    if not os.path.exists(cfg_path):
        report['valid'] = False
        report['error'] = f'Config file not found: {cfg_path}'
        return report

    try:
        manager = ConfigManager(cfg_path)
        is_valid = manager.validate_config()
        topic_names = manager.get_available_topics()
        # Enabled feeds are only looked up for a config that validated.
        enabled = manager.get_enabled_feeds() if is_valid else {}
        loaded = manager.load_config()
        db_section = loaded.get('database', {}) if isinstance(loaded, dict) else {}
        feed_count = len(enabled) if isinstance(enabled, dict) else 0
        report['valid'] = bool(is_valid)
        report['topics'] = topic_names
        report['enabled_feeds_count'] = feed_count
        report['db_paths'] = db_section
    except Exception as exc:
        report['valid'] = False
        report['error'] = str(exc)
    return report




[docs]
def html(
    topic: Optional[str] = None,
    output_path: Optional[str] = None,
    config_path: Optional[str] = None,
) -> None:
    """Generate HTML for one or all topics directly from papers.db.

    For every rendered topic two pages are produced: the regular page and a
    ranked page. A failure while producing the ranked page is logged and does
    not abort the run; a failure while producing the regular page propagates.

    Args:
        topic: Optional topic name. When omitted, HTML is produced for all topics
            defined in the configuration.
        output_path: Optional output path. Only valid when *topic* is provided; when
            generating all topics the configured filenames are used.
        config_path: Path to main YAML config; defaults to repo config.

    Raises:
        ValueError: If *output_path* is given without *topic*, if the
            configuration fails validation, or if no topics are available.
    """
    cfg_path = config_path or _DEFAULT_CONFIG

    if output_path and not topic:
        raise ValueError("output_path can only be provided when generating a single topic")

    config_manager = ConfigManager(cfg_path)
    if not config_manager.validate_config():
        raise ValueError(f"Invalid configuration at {cfg_path}")

    config = config_manager.load_config()
    db_manager = DatabaseManager(config)

    # Everything after the DatabaseManager is created runs under try/finally so
    # that connections are closed even if topic discovery or generator
    # construction raises.
    try:
        topics_to_render = [topic] if topic else config_manager.get_available_topics()
        if not topics_to_render:
            raise ValueError("No topics available in configuration")

        base_generator = HTMLGenerator()
        ranked_generator = HTMLGenerator(template_path='ranked_template.html')

        for topic_name in topics_to_render:
            topic_config = config_manager.load_topic_config(topic_name)
            output_config = topic_config.get('output', {})
            # The guard above guarantees output_path is only set for a single
            # explicit topic, so it can be used directly when present.
            topic_output_path = output_path or output_config.get(
                'filename', f'{topic_name}_filtered_articles.html'
            )

            heading = topic_config['name']
            description = topic_config.get('description')

            output_target = _resolve_output_path(topic_output_path)

            base_generator.generate_html_from_database(
                db_manager,
                topic_name,
                str(output_target),
                heading,
                description,
            )

            ranked_output_path = output_config.get('filename_ranked') or f'results_{topic_name}_ranked.html'
            try:
                ranked_target = _resolve_output_path(ranked_output_path)
                ranked_generator.generate_ranked_html_from_database(
                    db_manager,
                    topic_name,
                    str(ranked_target),
                    heading,
                    description,
                )
            except Exception as exc:
                # Best effort: the ranked page is optional, so log and move on
                # to the next topic instead of aborting the whole run.
                logger.error("Failed to generate ranked HTML for topic '%s': %s", topic_name, exc)
    finally:
        db_manager.close_all_connections()



# Backward compatibility aliases (deprecated).
# Both names are bound to the same function objects as their replacements
# and are still listed in __all__.
paperqa_summary = pqa_summary
generate_html = html