# Source code for paper_firehose.commands.generate_html
"""
Generate topic HTML directly from the current-run database (papers.db).
This bypasses fetching/filtering and renders HTML for one or all topics
using entries already stored in papers.db (status='filtered').
"""
import logging
from typing import Optional
from ..core.config import ConfigManager
from ..core.database import DatabaseManager
from ..core.paths import resolve_data_path
from ..processors.html_generator import HTMLGenerator
logger = logging.getLogger(__name__)
# [docs]
def run(config_path: str, topic: Optional[str] = None) -> None:
    """
    Generate HTML for a specific topic or all topics directly from papers.db.

    For every topic rendered, up to three pages are written under the
    ``html`` data directory:

    * the main page (``output.filename`` in the topic config, defaulting to
      ``<topic>_filtered_articles.html``),
    * the ranked page (``output.filename_ranked``, defaulting to
      ``results_<topic>_ranked.html``),
    * the summary page, only when ``output.filename_summary`` is configured.

    Failures are logged and isolated per topic and per page, so one broken
    topic does not prevent the remaining topics from being rendered. If the
    configuration fails validation, an error is logged and nothing is
    rendered.

    Args:
        config_path: Path to the main configuration file
        topic: Optional specific topic to render (if None, render all topics)
    """
    logger.info("Starting HTML generation from database")

    # Initialize components
    config_manager = ConfigManager(config_path)
    if not config_manager.validate_config():
        logger.error("Configuration validation failed")
        return

    config = config_manager.load_config()
    db_manager = DatabaseManager(config)

    # Everything after the database manager exists runs under try/finally so
    # its connections are released even when an unexpected error escapes
    # (e.g. while building a generator or listing the available topics).
    try:
        html_generator = HTMLGenerator()

        # Determine topics to render
        if topic:
            topics_to_render = [topic]
            logger.info("Rendering specific topic: %s", topic)
        else:
            topics_to_render = config_manager.get_available_topics()
            logger.info("Rendering all topics: %s", topics_to_render)

        for topic_name in topics_to_render:
            try:
                topic_config = config_manager.load_topic_config(topic_name)
                output_config = topic_config.get('output', {})
                output_filename = output_config.get(
                    'filename', f'{topic_name}_filtered_articles.html'
                )
                output_path = resolve_data_path('html', output_filename, ensure_parent=True)

                # Use the topic's display name and description
                heading = topic_config.get('name', topic_name)
                subheading = topic_config.get('description')

                # Generate from DB for this topic
                html_generator.generate_html_from_database(
                    db_manager,
                    topic_name,
                    str(output_path),
                    heading,
                    subheading,
                )
                logger.info("Generated HTML for topic '%s': %s", topic_name, output_path)

                # Always generate ranked HTML from current DB state to avoid stale files.
                # This has its own handler so a ranked-page failure is reported
                # separately from a main-page failure.
                try:
                    ranked_filename = (
                        output_config.get('filename_ranked')
                        or f'results_{topic_name}_ranked.html'
                    )
                    ranked_path = resolve_data_path('html', ranked_filename, ensure_parent=True)
                    ranked_template = 'ranked_template.html'
                    ranked_gen = HTMLGenerator(template_path=ranked_template)
                    ranked_gen.generate_ranked_html_from_database(
                        db_manager, topic_name, str(ranked_path), heading, subheading
                    )
                    logger.info(
                        "Generated ranked HTML for topic '%s': %s", topic_name, ranked_path
                    )
                except Exception as e:
                    logger.error(
                        "Failed to generate ranked HTML for topic '%s': %s", topic_name, e
                    )
            except Exception as e:
                logger.error("Error generating HTML for topic '%s': %s", topic_name, e)
                continue

        # Generate summarized HTML for each topic that has summaries
        try:
            html_gen = HTMLGenerator(template_path="llmsummary_template.html")
            for topic_name in topics_to_render:
                try:
                    topic_config = config_manager.load_topic_config(topic_name)
                    output_config = topic_config.get('output', {})
                    summary_filename = output_config.get('filename_summary')
                    if summary_filename:
                        summary_path = resolve_data_path(
                            'html', summary_filename, ensure_parent=True
                        )
                        topic_display_name = topic_config.get('name', topic_name)
                        topic_description = topic_config.get('description')
                        # Always generate the summary page. The generator prefers PQA summaries
                        # and falls back to ranked fields when none are available.
                        html_gen.generate_pqa_summarized_html_from_database(
                            db_manager,
                            topic_name,
                            str(summary_path),
                            f"PDF Summaries - {topic_display_name}",
                            topic_description
                        )
                        logger.info(
                            "Generated summarized HTML for topic '%s': %s",
                            topic_name,
                            summary_path,
                        )
                except Exception as e:
                    logger.error(
                        "Failed to generate summarized HTML for topic '%s': %s", topic_name, e
                    )
        except Exception as e:
            logger.error("Failed to generate summarized HTML: %s", e)
    finally:
        db_manager.close_all_connections()

    logger.info("HTML generation from database completed")