# papernews/papernews.py

#!/usr/bin/env python3
"""
papernews.py — Daily AI-curated newspaper PDF

Fetches RSS feeds from configurable news sources, asks Claude to pick and
summarize the top stories, renders a newspaper-style PDF via LaTeX, delivers
it to Telegram, and prunes PDFs older than a configurable number of days.

Schedule: configurable via SCHEDULE_HOUR/SCHEDULE_MINUTE (default: 7:00am).
Timezone: configurable via TIMEZONE (default: America/Chicago).
Set RUN_NOW=true in the environment to fire once immediately on startup (for testing).
"""

import json
import logging
import os
import re
import shutil
import subprocess
import tempfile
from datetime import datetime, timedelta
from pathlib import Path
from zoneinfo import ZoneInfo

import anthropic
import feedparser
import requests
from apscheduler.schedulers.blocking import BlockingScheduler

# ── Configuration ──────────────────────────────────────────────────────────
# Required secrets — a missing variable raises KeyError at import time, so the
# process fails fast instead of starting half-configured.
BOT_TOKEN      = os.environ["TELEGRAM_BOT_TOKEN"]
CHAT_ID        = os.environ["TELEGRAM_CHAT_ID"]
ANTHROPIC_KEY  = os.environ["ANTHROPIC_API_KEY"]

# Optional settings; each falls back to the default shown when unset.
# OUTPUT_DIR is where dated PDFs are written and later pruned. It defaults to
# /output but can be overridden so the script also runs where that path is
# not writable.
OUTPUT_DIR     = Path(os.environ.get("OUTPUT_DIR", "/output"))
LOCATION       = os.environ.get("LOCATION", "Your City")
TZ             = ZoneInfo(os.environ.get("TIMEZONE", "America/Chicago"))
RETENTION_DAYS = int(os.environ.get("RETENTION_DAYS", "5"))
SCHEDULE_HOUR  = int(os.environ.get("SCHEDULE_HOUR", "7"))
SCHEDULE_MIN   = int(os.environ.get("SCHEDULE_MINUTE", "0"))
# Only the literal value "true" (any letter case) enables the immediate run.
RUN_NOW        = os.environ.get("RUN_NOW", "").lower() == "true"

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
log = logging.getLogger(__name__)

# Convenience alias for TZ — used in run() and the scheduler. The name is
# historical; it is whatever zone TIMEZONE selects, not necessarily Central.
CT = TZ


# ── News sections and RSS feeds ────────────────────────────────────────────
# Each section has a display title and a list of (source_name, rss_url) pairs.
# The title is passed through esc() before rendering; "---" survives that and
# is typeset by LaTeX as an em dash.
# Every feed is fetched over HTTPS: the text is handed to the model and ends
# up in print, so it should not travel over a plaintext connection.
SECTIONS = [
    {
        "title": "Local --- Kansas City",
        "feeds": [
            ("KCUR",    "https://www.kcur.org/feed"),
            ("Fox4KC",  "https://www.fox4kc.com/feed/"),
            ("KSHB",    "https://www.kshb.com/news.rss"),
        ],
    },
    {
        "title": "National",
        "feeds": [
            ("AP News", "https://feeds.apnews.com/rss/apf-topnews"),
            ("NPR",     "https://feeds.npr.org/1001/rss.xml"),
            ("PBS",     "https://www.pbs.org/newshour/feeds/rss/headlines"),
        ],
    },
    {
        "title": "International",
        "feeds": [
            ("BBC World",    "https://feeds.bbci.co.uk/news/world/rss.xml"),
            ("The Guardian", "https://www.theguardian.com/world/rss"),
            ("Reuters",      "https://feeds.reuters.com/reuters/topNews"),
        ],
    },
    {
        "title": "Technology",
        "feeds": [
            ("Hacker News",  "https://news.ycombinator.com/rss"),
            ("Ars Technica", "https://feeds.arstechnica.com/arstechnica/index"),
        ],
    },
]

# How many headlines to collect from each feed before handing off to Claude
MAX_ARTICLES_PER_FEED = 6


# ── LaTeX helpers ──────────────────────────────────────────────────────────
# Lookup table for str.translate(): the code point of every character LaTeX
# treats specially, mapped to the text that prints that character literally.
# translate() visits each input character exactly once, so the backslashes and
# braces inside a replacement are never escaped a second time.
_LATEX_ESCAPES = {
    ord(char): replacement
    for char, replacement in (
        ("\\", r"\textbackslash{}"),
        ("&", r"\&"),
        ("%", r"\%"),
        ("$", r"\$"),
        ("#", r"\#"),
        ("_", r"\_"),
        ("{", r"\{"),
        ("}", r"\}"),
        ("~", r"\textasciitilde{}"),
        ("^", r"\textasciicircum{}"),
    )
}


def esc(text: str) -> str:
    """Return *text*, coerced to ``str``, with LaTeX special characters escaped."""
    plain = str(text)
    return plain.translate(_LATEX_ESCAPES)


# Raw string template — avoids double-escaping LaTeX braces.
# run() fills it in with plain str.replace() on three literal markers:
# PAPERLOCATION and PAPERDATE (masthead line) and PAPERBODY (two-column body).
# The doubled "%%" inside the template is just an ordinary LaTeX comment.
LATEX_TEMPLATE = r"""\documentclass[10pt]{article}
\usepackage[margin=0.65in, top=0.4in, columnsep=0.3in]{geometry}
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{lmodern}
\usepackage{multicol}
\usepackage{parskip}
\usepackage{microtype}
\usepackage{titlesec}

%% Section header: centered, bold, with a rule below
\titleformat{\section}[block]{\large\bfseries\filcenter}{}{0pt}{}[\titlerule]
\titlespacing*{\section}{0pt}{14pt}{6pt}

\setlength{\parindent}{0pt}
\setlength{\parskip}{4pt}
\pagestyle{empty}

\begin{document}

%% Masthead — full width above the two-column body
\begin{center}
  {\fontsize{42}{50}\selectfont\textbf{Papernews}}\\[2pt]
  \rule{\linewidth}{2pt}\\[2pt]
  {\footnotesize PAPERLOCATION\hfill\textit{PAPERDATE}}\\
  \rule{\linewidth}{0.4pt}
\end{center}
\vspace{6pt}

%% Two-column newspaper content
\begin{multicols}{2}

PAPERBODY

\end{multicols}
\end{document}
"""


def build_section_latex(title: str, stories: list) -> str:
    """Render one newspaper section from its AI-summarized stories.

    Args:
        title: Section heading; escaped before being placed in ``\\section*``.
        stories: Items parsed from the model's JSON reply. Each is expected to
            be a dict with "headline" and "body" keys, but malformed items
            are tolerated rather than raising.

    Returns:
        LaTeX source for the section: the heading followed by one bold
        headline and one body paragraph per usable story.
    """
    lines = [rf"\section*{{{esc(title)}}}"]
    for story in stories:
        # The list comes from model-generated JSON, so an item may be a bare
        # string or number; skip it instead of crashing on .get().
        if not isinstance(story, dict):
            continue

        # `or ""` turns an explicit JSON null into empty text rather than the
        # literal word "None". split()/join() folds newlines and runs of
        # whitespace so the headline stays a single line inside \textbf{}.
        headline = esc(" ".join(str(story.get("headline") or "").split()))
        body     = esc(str(story.get("body") or "").strip())
        if not headline and not body:
            continue

        if headline:
            lines.append(rf"\textbf{{{headline}}}")
            lines.append("")
        lines.append(body)
        lines.append("")
    return "\n".join(lines)


# ── RSS fetching ───────────────────────────────────────────────────────────
# Fetch via requests (with explicit timeout) then parse the downloaded bytes,
# so feedparser never makes its own unguarded network calls that can hang forever.
_FEED_HEADERS = {"User-Agent": "Papernews/1.0 (rocklab daily digest)"}


def fetch_articles(feeds: list) -> list:
    """Pull the top headlines from each feed in a section.

    Args:
        feeds: Iterable of ``(source_name, rss_url)`` pairs.

    Returns:
        A list of dicts with "source", "title" and "summary" keys, at most
        MAX_ARTICLES_PER_FEED per feed. A feed that fails to download or
        parse is logged and skipped; it never aborts the other feeds.
    """
    articles = []
    for source, url in feeds:
        try:
            resp = requests.get(url, timeout=12, headers=_FEED_HEADERS)
            resp.raise_for_status()
            # Hand feedparser the raw bytes. resp.text would first decode with
            # whatever charset requests guessed from the HTTP headers, which
            # can disagree with the encoding the XML document itself declares
            # and garble non-ASCII headlines.
            feed = feedparser.parse(resp.content)
            for entry in feed.entries[:MAX_ARTICLES_PER_FEED]:
                # `or ""` guards against keys that are present but None, which
                # would otherwise raise on .strip() and drop the rest of the feed.
                title   = (entry.get("title") or "").strip()
                summary = (entry.get("summary") or entry.get("description") or "").strip()
                # Strip HTML tags — Claude handles messy plain text fine
                summary = re.sub(r"<[^>]+>", " ", summary).strip()
                if title:
                    articles.append({
                        "source":  source,
                        "title":   title,
                        "summary": summary[:500],
                    })
        except Exception as exc:
            # Deliberately broad: one bad feed must not take down the section.
            log.warning("Feed fetch failed — %s (%s): %s", source, url, exc)
    return articles


# ── Claude summarization ───────────────────────────────────────────────────
def summarize_section(client: anthropic.Anthropic, section_title: str, articles: list) -> list:
    """Ask Claude to select and summarize the top stories for one section.

    Args:
        client: Anthropic API client used for the request.
        section_title: Section name, woven into the editor prompt.
        articles: Dicts with "source", "title" and "summary" keys.

    Returns:
        A list of story dicts as produced by the model (expected keys:
        "headline" and "body"). Returns an empty list when there are no
        articles, the API call fails, or the reply is not a JSON array.
    """
    if not articles:
        log.warning("No articles fetched for section: %s", section_title)
        return []

    article_text = "\n\n".join(
        f"[{a['source']}] {a['title']}\n{a['summary']}" for a in articles
    )

    prompt = (
        f"You are the {section_title} editor for a daily newspaper called Papernews.\n\n"
        "From the articles below, select the 4–6 most newsworthy stories and write a concise digest.\n\n"
        "Return ONLY a JSON array with no other text:\n"
        '[\n  {"headline": "Short headline", "body": "1–3 sentences of newspaper prose."},\n  ...\n]\n\n'
        "Rules: headlines under 10 words, body text factual and tight, no source names or URLs.\n\n"
        f"Articles:\n{article_text}"
    )

    try:
        msg = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=2048,
            messages=[{"role": "user", "content": prompt}],
        )
        raw = msg.content[0].text.strip()
        # Strip markdown code fences if Claude wraps the JSON
        if raw.startswith("```"):
            raw = raw.split("```")[1]
            if raw.startswith("json"):
                raw = raw[4:]
        raw = raw.strip()
        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError:
            # Fallback for replies that add prose around the array: retry on
            # the outermost [...] span, re-raising if there is none.
            start, end = raw.find("["), raw.rfind("]")
            if start == -1 or end <= start:
                raise
            parsed = json.loads(raw[start:end + 1])
    except Exception as exc:
        log.error("Claude summarization failed for '%s': %s", section_title, exc)
        return []

    # Valid JSON is not necessarily the requested shape. Callers iterate the
    # result and call .get() on each item, so enforce "list of dicts" here.
    if not isinstance(parsed, list):
        log.error(
            "Claude returned a JSON %s instead of an array for '%s'",
            type(parsed).__name__, section_title,
        )
        return []
    stories = [item for item in parsed if isinstance(item, dict)]
    if len(stories) != len(parsed):
        log.warning(
            "Dropped %d malformed story item(s) for '%s'",
            len(parsed) - len(stories), section_title,
        )
    return stories


# ── PDF compilation ────────────────────────────────────────────────────────
def compile_pdf(latex: str, output_path: Path) -> bool:
    """Compile a LaTeX string to PDF and copy it to output_path.

    Args:
        latex: Complete LaTeX document source.
        output_path: Destination for the finished PDF.

    Returns:
        True when pdflatex exits cleanly and the PDF was copied; False when
        pdflatex is missing, times out, reports an error, or produces no PDF.
    """
    with tempfile.TemporaryDirectory() as tmp:
        tex_path = Path(tmp) / "papernews.tex"
        tex_path.write_text(latex, encoding="utf-8")

        try:
            result = subprocess.run(
                ["pdflatex", "-interaction=nonstopmode", "-output-directory", tmp, str(tex_path)],
                capture_output=True,
                text=True,
                # The TeX log is not guaranteed to be valid UTF-8 (it can echo
                # font-encoded bytes and hard-wraps long lines), so never let a
                # decode error mask the real compile result.
                encoding="utf-8",
                errors="replace",
                # Bound the run so a runaway compile cannot block the job forever.
                timeout=120,
            )
        except FileNotFoundError:
            log.error("pdflatex executable not found on PATH.")
            return False
        except subprocess.TimeoutExpired:
            log.error("pdflatex timed out after 120 seconds.")
            return False

        pdf_src = Path(tmp) / "papernews.pdf"
        if result.returncode != 0 or not pdf_src.exists():
            log.error("pdflatex failed. Last 3000 chars of output:\n%s", result.stdout[-3000:])
            return False

        # Copy before the temporary directory (and the PDF in it) is removed.
        shutil.copy2(pdf_src, output_path)
        return True


# ── Telegram delivery ──────────────────────────────────────────────────────
def _redact_token(text: str) -> str:
    """Return *text* with the bot token masked so it never reaches the logs."""
    # Guard the empty case: "".replace("", x) would insert x between every char.
    return text.replace(BOT_TOKEN, "<redacted>") if BOT_TOKEN else text


def send_pdf(path: Path, caption: str) -> bool:
    """Upload the PDF to the configured Telegram chat.

    Args:
        path: PDF file to upload.
        caption: Text shown with the document.

    Returns:
        True when Telegram accepts the upload; False on an HTTP error
        response or any exception (which is logged, not raised).
    """
    url = f"https://api.telegram.org/bot{BOT_TOKEN}/sendDocument"
    try:
        with open(path, "rb") as f:
            resp = requests.post(
                url,
                data={"chat_id": CHAT_ID, "caption": caption},
                files={"document": f},
                timeout=60,
            )
        if resp.ok:
            log.info("PDF delivered to Telegram.")
            return True
        log.error("Telegram sendDocument failed: %s — %s", resp.status_code, resp.text)
        return False
    except Exception as exc:
        # requests error messages can quote the request URL, and the URL
        # contains the bot token — mask it before logging.
        log.error("Telegram send error: %s", _redact_token(str(exc)))
        return False


def send_message(text: str) -> None:
    """Send a plain-text message to Telegram (used for error alerts).

    Best-effort: failures are logged and never raised, so an alert that
    cannot be delivered does not mask the error being reported.
    """
    url = f"https://api.telegram.org/bot{BOT_TOKEN}/sendMessage"
    try:
        resp = requests.post(url, json={"chat_id": CHAT_ID, "text": text}, timeout=15)
    except Exception as exc:
        log.error("Telegram message error: %s", _redact_token(str(exc)))
        return
    # A rejected alert (bad chat id, revoked token, ...) would otherwise vanish silently.
    if not resp.ok:
        log.error("Telegram sendMessage failed: %s — %s", resp.status_code, resp.text)


# ── Cleanup ────────────────────────────────────────────────────────────────
def prune_old_pdfs() -> None:
    """Delete papernews PDFs from OUTPUT_DIR older than RETENTION_DAYS.

    Age is judged by file modification time. Pruning is best-effort: a file
    that cannot be inspected or removed is logged and skipped, so one bad
    file neither stops the others from being pruned nor raises into run().
    """
    cutoff = datetime.now() - timedelta(days=RETENTION_DAYS)
    for pdf in OUTPUT_DIR.glob("papernews-*.pdf"):
        try:
            if datetime.fromtimestamp(pdf.stat().st_mtime) >= cutoff:
                continue
            pdf.unlink()
        except OSError as exc:
            # Covers files that vanished since glob() as well as permission errors.
            log.warning("Could not prune %s: %s", pdf.name, exc)
            continue
        log.info("Pruned: %s", pdf.name)


# ── Main pipeline ──────────────────────────────────────────────────────────
def run() -> None:
    """Full pipeline: fetch → summarize → render → deliver → prune.

    Each section is fetched and summarized independently; sections that yield
    no stories are omitted. If nothing at all was produced, or the LaTeX
    compile fails, an alert is sent to Telegram and the run stops. Old PDFs
    are pruned only after a successful delivery.
    """
    now      = datetime.now(CT)
    # Build the day number from now.day rather than strftime's "%-d", which is
    # a glibc extension and not available on every platform.
    date_str = f"{now:%A, %B} {now.day}, {now:%Y}"
    pdf_path = OUTPUT_DIR / f"papernews-{now.strftime('%Y-%m-%d')}.pdf"

    log.info("Starting Papernews run for %s", date_str)
    client = anthropic.Anthropic(api_key=ANTHROPIC_KEY)

    body_parts = []
    for section in SECTIONS:
        log.info("Processing section: %s", section["title"])
        articles = fetch_articles(section["feeds"])
        log.info("  Fetched %d articles", len(articles))
        stories = summarize_section(client, section["title"], articles)
        log.info("  Summarized to %d stories", len(stories))
        if stories:
            body_parts.append(build_section_latex(section["title"], stories))

    if not body_parts:
        msg = "Papernews: no content generated today — check container logs."
        log.error(msg)
        send_message(msg)
        return

    # Substitute location, date, and body into the template using plain str.replace().
    # LOCATION is free-form user configuration, so it is escaped like any other
    # text: an unescaped "&", "_" or "%" would otherwise break the compile.
    # The body parts are already LaTeX and are inserted as-is.
    latex = (
        LATEX_TEMPLATE
        .replace("PAPERLOCATION", esc(LOCATION))
        .replace("PAPERDATE", date_str)
        .replace("PAPERBODY", "\n\n".join(body_parts))
    )

    if not compile_pdf(latex, pdf_path):
        send_message("Papernews: LaTeX compile failed — check container logs.")
        return

    if not send_pdf(pdf_path, f"Papernews — {date_str}"):
        log.error("PDF compiled but Telegram delivery failed: %s", pdf_path)
        return

    prune_old_pdfs()
    log.info("Papernews complete: %s", pdf_path.name)


# ── Entry point ────────────────────────────────────────────────────────────
if __name__ == "__main__":
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    if RUN_NOW:
        log.info("RUN_NOW=true — running immediately.")
        # An unexpected error in the one-off run is logged with its traceback
        # but must not stop the daily schedule from being installed.
        try:
            run()
        except Exception:
            log.exception("Immediate run failed; continuing to the daily schedule.")

    scheduler = BlockingScheduler(timezone=CT)
    scheduler.add_job(run, "cron", hour=SCHEDULE_HOUR, minute=SCHEDULE_MIN, id="papernews_daily")
    log.info("Papernews scheduled for %02d:%02d daily.", SCHEDULE_HOUR, SCHEDULE_MIN)
    try:
        # Blocks until the process is interrupted.
        scheduler.start()
    except (KeyboardInterrupt, SystemExit):
        # Normal shutdown path (Ctrl-C / container stop): exit without a traceback.
        log.info("Papernews scheduler stopped.")