# Source code for paper_firehose.commands.generate_html

"""
Generate topic HTML directly from the current-run database (papers.db).

This bypasses fetching/filtering and renders HTML for one or all topics
using entries already stored in papers.db (status='filtered').
"""

import logging
from typing import Optional

from ..core.config import ConfigManager
from ..core.database import DatabaseManager
from ..core.paths import resolve_data_path
from ..processors.html_generator import HTMLGenerator

logger = logging.getLogger(__name__)


def _render_topic_pages(
    config_manager: "ConfigManager",
    db_manager: "DatabaseManager",
    html_generator: "HTMLGenerator",
    topic_name: str,
) -> None:
    """Render the filtered page and then the ranked page for one topic.

    Exceptions raised while loading the topic config or rendering the
    filtered page propagate to the caller. A failure while rendering the
    ranked page is logged and swallowed here, so it never masks a
    successfully written filtered page.

    Args:
        config_manager: Source of the per-topic configuration.
        db_manager: Database handle passed through to the HTML generator.
        html_generator: Generator used for the filtered (default) page.
        topic_name: Topic key to render.
    """
    topic_config = config_manager.load_topic_config(topic_name)
    output_config = topic_config.get('output', {})
    output_filename = output_config.get('filename', f'{topic_name}_filtered_articles.html')
    output_path = resolve_data_path('html', output_filename, ensure_parent=True)

    # Use the topic's display name and description; fall back to the topic key
    # when no display name is configured.
    heading = topic_config.get('name', topic_name)
    subheading = topic_config.get('description')

    html_generator.generate_html_from_database(
        db_manager,
        topic_name,
        str(output_path),
        heading,
        subheading,
    )
    logger.info("Generated HTML for topic '%s': %s", topic_name, output_path)

    # Always generate ranked HTML from current DB state to avoid stale files.
    # Best-effort: a ranked-page failure is reported but does not abort the topic.
    try:
        # `or` (not a .get default) so an empty/None configured value also
        # falls back to the default ranked filename.
        ranked_filename = output_config.get('filename_ranked') or f'results_{topic_name}_ranked.html'
        ranked_path = resolve_data_path('html', ranked_filename, ensure_parent=True)
        ranked_gen = HTMLGenerator(template_path='ranked_template.html')
        ranked_gen.generate_ranked_html_from_database(
            db_manager, topic_name, str(ranked_path), heading, subheading
        )
        logger.info("Generated ranked HTML for topic '%s': %s", topic_name, ranked_path)
    except Exception as e:
        logger.error("Failed to generate ranked HTML for topic '%s': %s", topic_name, e)


def _render_topic_summary(
    config_manager: "ConfigManager",
    db_manager: "DatabaseManager",
    summary_generator: "HTMLGenerator",
    topic_name: str,
) -> None:
    """Render the summary page for one topic, if the topic configures one.

    Topics whose output config has no (or an empty) ``filename_summary`` are
    skipped silently. Any exception propagates to the caller.

    Args:
        config_manager: Source of the per-topic configuration.
        db_manager: Database handle passed through to the HTML generator.
        summary_generator: Generator built with the summary template.
        topic_name: Topic key to render.
    """
    topic_config = config_manager.load_topic_config(topic_name)
    output_config = topic_config.get('output', {})
    summary_filename = output_config.get('filename_summary')
    if not summary_filename:
        return

    summary_path = resolve_data_path('html', summary_filename, ensure_parent=True)
    topic_display_name = topic_config.get('name', topic_name)
    topic_description = topic_config.get('description')

    # Always generate the summary page. The generator prefers PQA summaries
    # and falls back to ranked fields when none are available.
    summary_generator.generate_pqa_summarized_html_from_database(
        db_manager,
        topic_name,
        str(summary_path),
        f"PDF Summaries - {topic_display_name}",
        topic_description,
    )
    logger.info("Generated summarized HTML for topic '%s': %s", topic_name, summary_path)


def run(config_path: str, topic: Optional[str] = None) -> None:
    """
    Generate HTML for a specific topic or all topics directly from papers.db.

    For each topic this renders the filtered page and the ranked page, then
    (in a second pass) the summary page for topics that configure
    ``filename_summary``. Per-topic failures are logged and do not stop the
    remaining topics. If configuration validation fails, an error is logged
    and the function returns without opening the database.

    Args:
        config_path: Path to the main configuration file
        topic: Optional specific topic to render (if None, render all topics)
    """
    logger.info("Starting HTML generation from database")

    # Initialize components
    config_manager = ConfigManager(config_path)
    if not config_manager.validate_config():
        logger.error("Configuration validation failed")
        return

    config = config_manager.load_config()
    db_manager = DatabaseManager(config)

    # Everything after the database manager exists runs under try/finally so
    # its connections are released even if an unguarded step raises.
    try:
        html_generator = HTMLGenerator()

        # Determine topics to render
        if topic:
            topics_to_render = [topic]
            logger.info("Rendering specific topic: %s", topic)
        else:
            topics_to_render = config_manager.get_available_topics()
            logger.info("Rendering all topics: %s", topics_to_render)

        # Pass 1: filtered + ranked pages. One broken topic must not block the rest.
        for topic_name in topics_to_render:
            try:
                _render_topic_pages(config_manager, db_manager, html_generator, topic_name)
            except Exception as e:
                logger.error("Error generating HTML for topic '%s': %s", topic_name, e)

        # Pass 2: summarized HTML for each topic that configures a summary file.
        # The outer guard covers construction of the summary generator itself.
        try:
            summary_generator = HTMLGenerator(template_path="llmsummary_template.html")
            for topic_name in topics_to_render:
                try:
                    _render_topic_summary(config_manager, db_manager, summary_generator, topic_name)
                except Exception as e:
                    logger.error("Failed to generate summarized HTML for topic '%s': %s", topic_name, e)
        except Exception as e:
            logger.error("Failed to generate summarized HTML: %s", e)
    finally:
        db_manager.close_all_connections()

    logger.info("HTML generation from database completed")