papernews/papernews.py
Dave Campbell e31282cbb5 Initial release of Papernews
Daily AI-curated newspaper PDF delivered via Telegram. Fetches RSS feeds,
summarizes top stories with Claude, compiles a LaTeX PDF in newspaper layout.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-08 12:41:55 +00:00

356 lines
14 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
papernews.py — Daily AI-curated newspaper PDF
Fetches RSS feeds from configurable news sources, asks Claude to pick and
summarize the top stories, renders a newspaper-style PDF via LaTeX, delivers
it to Telegram, and prunes PDFs older than a configurable number of days.
Schedule: configurable via SCHEDULE_HOUR/SCHEDULE_MINUTE (default: 7:00am).
Timezone: configurable via TIMEZONE (default: America/Chicago).
Set RUN_NOW=true in the environment to fire once immediately on startup (for testing).
"""
import json
import logging
import os
import re
import shutil
import subprocess
import tempfile
from datetime import datetime, timedelta
from pathlib import Path
from zoneinfo import ZoneInfo
import anthropic
import feedparser
import requests
from apscheduler.schedulers.blocking import BlockingScheduler
# ── Configuration ──────────────────────────────────────────────────────────
# Required settings: indexing os.environ raises KeyError at import time when
# any of these is unset, so a misconfigured process fails on startup.
BOT_TOKEN = os.environ["TELEGRAM_BOT_TOKEN"]
CHAT_ID = os.environ["TELEGRAM_CHAT_ID"]
ANTHROPIC_KEY = os.environ["ANTHROPIC_API_KEY"]
# Directory where finished PDFs are written and later pruned.
OUTPUT_DIR = Path("/output")
# Optional settings, each with a default.
# LOCATION is printed in the masthead next to the date.
LOCATION = os.environ.get("LOCATION", "Your City")
# Timezone used for the date on the paper and for the scheduler.
TZ = ZoneInfo(os.environ.get("TIMEZONE", "America/Chicago"))
# PDFs older than this many days are deleted by prune_old_pdfs().
RETENTION_DAYS = int(os.environ.get("RETENTION_DAYS", "5"))
# Time of day (in TZ) at which the daily job fires.
SCHEDULE_HOUR = int(os.environ.get("SCHEDULE_HOUR", "7"))
SCHEDULE_MIN = int(os.environ.get("SCHEDULE_MINUTE", "0"))
# Only the value "true" (case-insensitive) enables the immediate run.
RUN_NOW = os.environ.get("RUN_NOW", "").lower() == "true"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
log = logging.getLogger(__name__)
# Convenience alias — used in run() and the scheduler
CT = TZ
# ── News sections and RSS feeds ────────────────────────────────────────────
# Each section has a display title and a list of (source_name, rss_url) pairs.
# The title is used both as the LaTeX section heading and inside the Claude
# prompt. It passes through esc(), which leaves "-" untouched, so "---"
# reaches LaTeX as written (an em-dash ligature in standard LaTeX fonts).
# Sections appear in the PDF in the order listed here.
SECTIONS = [
    {
        "title": "Local --- Kansas City",
        "feeds": [
            ("KCUR", "https://www.kcur.org/feed"),
            ("Fox4KC", "https://www.fox4kc.com/feed/"),
            ("KSHB", "https://www.kshb.com/news.rss"),
        ],
    },
    {
        "title": "National",
        "feeds": [
            ("AP News", "https://feeds.apnews.com/rss/apf-topnews"),
            ("NPR", "https://feeds.npr.org/1001/rss.xml"),
            ("PBS", "https://www.pbs.org/newshour/feeds/rss/headlines"),
        ],
    },
    {
        "title": "International",
        "feeds": [
            ("BBC World", "http://feeds.bbci.co.uk/news/world/rss.xml"),
            ("The Guardian", "https://www.theguardian.com/world/rss"),
            ("Reuters", "https://feeds.reuters.com/reuters/topNews"),
        ],
    },
    {
        "title": "Technology",
        "feeds": [
            ("Hacker News", "https://news.ycombinator.com/rss"),
            ("Ars Technica", "http://feeds.arstechnica.com/arstechnica/index"),
        ],
    },
]
# How many headlines to collect from each feed before handing off to Claude.
# Applied per feed, so a section sends at most len(feeds) * this many
# articles to the model.
MAX_ARTICLES_PER_FEED = 6
# ── LaTeX helpers ──────────────────────────────────────────────────────────
# Translation table keyed by code point: every character LaTeX treats as
# special maps to the text that typesets it literally.  str.translate() visits
# each input character exactly once, so the braces inside a replacement such as
# "\textbackslash{}" are never escaped a second time.
_LATEX_ESCAPES = {
    ord(char): replacement
    for char, replacement in (
        ("\\", r"\textbackslash{}"),
        ("&", r"\&"),
        ("%", r"\%"),
        ("$", r"\$"),
        ("#", r"\#"),
        ("_", r"\_"),
        ("{", r"\{"),
        ("}", r"\}"),
        ("~", r"\textasciitilde{}"),
        ("^", r"\textasciicircum{}"),
    )
}


def esc(text: str) -> str:
    """Return *text* with LaTeX-special characters made safe to typeset.

    Non-string values are converted with ``str()`` first.
    """
    plain = str(text)
    return plain.translate(_LATEX_ESCAPES)
# Raw string template — avoids double-escaping LaTeX braces.
# run() fills in the literal markers PAPERLOCATION, PAPERDATE and PAPERBODY
# with plain str.replace().  No %-formatting is applied to this string, so the
# "%%" lines below are ordinary LaTeX comments.
LATEX_TEMPLATE = r"""\documentclass[10pt]{article}
\usepackage[margin=0.65in, top=0.4in, columnsep=0.3in]{geometry}
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{lmodern}
\usepackage{multicol}
\usepackage{parskip}
\usepackage{microtype}
\usepackage{titlesec}
%% Section header: centered, bold, with a rule below
\titleformat{\section}[block]{\large\bfseries\filcenter}{}{0pt}{}[\titlerule]
\titlespacing*{\section}{0pt}{14pt}{6pt}
\setlength{\parindent}{0pt}
\setlength{\parskip}{4pt}
\pagestyle{empty}
\begin{document}
%% Masthead — full width above the two-column body
\begin{center}
{\fontsize{42}{50}\selectfont\textbf{Papernews}}\\[2pt]
\rule{\linewidth}{2pt}\\[2pt]
{\footnotesize PAPERLOCATION\hfill\textit{PAPERDATE}}\\
\rule{\linewidth}{0.4pt}
\end{center}
\vspace{6pt}
%% Two-column newspaper content
\begin{multicols}{2}
PAPERBODY
\end{multicols}
\end{document}
"""
def build_section_latex(title: str, stories: list) -> str:
    """Render one newspaper section from its AI-summarized stories.

    Args:
        title: Section heading; escaped before it is placed in the heading.
        stories: Parsed model output, expected to be a list of dicts with
            ``"headline"`` and ``"body"`` keys.  Entries that are not dicts,
            or that have neither a headline nor a body, are skipped.

    Returns:
        LaTeX source: the section heading followed by one bold headline and
        one body paragraph per story, separated by blank lines.
    """
    lines = [rf"\section*{{{esc(title)}}}"]
    for story in stories:
        # The list comes from model-generated JSON, so its shape is not guaranteed.
        if not isinstance(story, dict):
            continue
        # `or ""` also covers an explicit JSON null, which would otherwise be
        # rendered as the literal text "None".
        raw_headline = str(story.get("headline") or "")
        raw_body = str(story.get("body") or "")
        # Collapse whitespace: a blank line inside \textbf{...} is a LaTeX error.
        headline = esc(" ".join(raw_headline.split()))
        body = esc(raw_body.strip())
        if not headline and not body:
            continue
        lines.append(rf"\textbf{{{headline}}}")
        lines.append("")
        lines.append(body)
        lines.append("")
    return "\n".join(lines)
# ── RSS fetching ───────────────────────────────────────────────────────────
# Fetch via requests (with explicit timeout) then parse the downloaded payload,
# so feedparser never makes its own unguarded network calls that can hang forever.
_FEED_HEADERS = {"User-Agent": "Papernews/1.0 (rocklab daily digest)"}


def fetch_articles(feeds: list) -> list:
    """Pull the top headlines from each feed in a section.

    Args:
        feeds: Iterable of ``(source_name, rss_url)`` pairs.

    Returns:
        A list of dicts with ``"source"``, ``"title"`` and ``"summary"`` keys
        (summary is tag-stripped and capped at 500 characters).  A feed that
        fails to download or parse is logged and skipped, so the result may
        be empty.
    """
    articles = []
    for source, url in feeds:
        try:
            resp = requests.get(url, timeout=12, headers=_FEED_HEADERS)
            resp.raise_for_status()
            # Hand feedparser the raw bytes so it can detect the feed's real
            # encoding.  resp.text falls back to ISO-8859-1 when a text/*
            # Content-Type carries no charset, which garbles UTF-8 feeds.
            feed = feedparser.parse(resp.content)
            for entry in feed.entries[:MAX_ARTICLES_PER_FEED]:
                title = (entry.get("title") or "").strip()
                if not title:
                    continue
                summary = entry.get("summary") or entry.get("description") or ""
                # Strip HTML tags — Claude handles messy plain text fine
                summary = re.sub(r"<[^>]+>", " ", summary)
                # Collapse the whitespace runs left behind by removed tags so
                # the 500-character budget is spent on actual text.
                summary = " ".join(summary.split())
                articles.append({
                    "source": source,
                    "title": title,
                    "summary": summary[:500],
                })
        except Exception as exc:
            # Best effort: one broken feed must not sink the whole section.
            log.warning("Feed fetch failed — %s (%s): %s", source, url, exc)
    return articles
# ── Claude summarization ───────────────────────────────────────────────────
def summarize_section(client: anthropic.Anthropic, section_title: str, articles: list) -> list:
    """Ask Claude to select and summarize the top stories for one section.

    Args:
        client: Anthropic API client used for the request.
        section_title: Section name, inserted into the prompt.
        articles: Dicts with ``"source"``, ``"title"`` and ``"summary"`` keys.

    Returns:
        A list of story dicts as returned by the model (expected keys:
        ``"headline"`` and ``"body"``).  Returns ``[]`` when there are no
        articles, when the API call fails, or when the reply is not a JSON
        array; failures are logged rather than raised.
    """
    if not articles:
        log.warning("No articles fetched for section: %s", section_title)
        return []
    article_text = "\n\n".join(
        f"[{a['source']}] {a['title']}\n{a['summary']}" for a in articles
    )
    # The ranges are written with a plain ASCII hyphen so they cannot be lost
    # or misread ("4-6" stories, "1-3" sentences).
    prompt = (
        f"You are the {section_title} editor for a daily newspaper called Papernews.\n\n"
        "From the articles below, select the 4-6 most newsworthy stories and write a concise digest.\n\n"
        "Return ONLY a JSON array with no other text:\n"
        '[\n {"headline": "Short headline", "body": "1-3 sentences of newspaper prose."},\n ...\n]\n\n'
        "Rules: headlines under 10 words, body text factual and tight, no source names or URLs.\n\n"
        f"Articles:\n{article_text}"
    )
    try:
        msg = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=2048,
            messages=[{"role": "user", "content": prompt}],
        )
        raw = msg.content[0].text.strip()
        # Strip markdown code fences if Claude wraps the JSON
        if raw.startswith("```"):
            raw = raw.split("```")[1]
            if raw.startswith("json"):
                raw = raw[4:]
        stories = json.loads(raw.strip())
    except Exception as exc:
        log.error("Claude summarization failed for '%s': %s", section_title, exc)
        return []
    # Valid JSON is not necessarily the requested shape; callers iterate the
    # result and call .get() on each element.
    if not isinstance(stories, list):
        log.error(
            "Claude summarization failed for '%s': expected a JSON array, got %s",
            section_title,
            type(stories).__name__,
        )
        return []
    return [story for story in stories if isinstance(story, dict)]
# ── PDF compilation ────────────────────────────────────────────────────────
def compile_pdf(latex: str, output_path: Path) -> bool:
    """Compile a LaTeX string to PDF and copy it to output_path.

    Args:
        latex: Complete LaTeX document source.
        output_path: Destination for the finished PDF.

    Returns:
        True when pdflatex exits successfully and the PDF has been copied;
        False when pdflatex cannot be started, times out, exits non-zero, or
        produces no PDF.  Those failures are logged, not raised.
    """
    with tempfile.TemporaryDirectory() as tmp:
        tex_path = Path(tmp) / "papernews.tex"
        tex_path.write_text(latex, encoding="utf-8")
        try:
            result = subprocess.run(
                ["pdflatex", "-interaction=nonstopmode", "-output-directory", tmp, str(tex_path)],
                capture_output=True,
                text=True,
                # pdflatex may echo bytes that are not valid in the locale
                # encoding; never let log decoding abort the compile step.
                errors="replace",
                # Guard against a hung compile blocking the scheduler thread.
                timeout=120,
            )
        except (OSError, subprocess.TimeoutExpired) as exc:
            # OSError covers a missing or non-executable pdflatex binary.
            log.error("pdflatex could not be run to completion: %s", exc)
            return False
        pdf_src = Path(tmp) / "papernews.pdf"
        if result.returncode != 0 or not pdf_src.exists():
            log.error("pdflatex failed. Last 3000 chars of output:\n%s", result.stdout[-3000:])
            return False
        # Copy before the temporary directory (and the PDF in it) is removed.
        shutil.copy2(pdf_src, output_path)
        return True
# ── Telegram delivery ──────────────────────────────────────────────────────
def send_pdf(path: Path, caption: str) -> bool:
    """Upload the PDF to the configured Telegram chat.

    Args:
        path: PDF file to upload.
        caption: Caption shown with the document.

    Returns:
        True when Telegram answers with a success status, False on an error
        status or any exception (both are logged).
    """
    url = f"https://api.telegram.org/bot{BOT_TOKEN}/sendDocument"
    try:
        with open(path, "rb") as f:
            resp = requests.post(
                url,
                data={"chat_id": CHAT_ID, "caption": caption},
                files={"document": f},
                timeout=60,
            )
        if resp.ok:
            log.info("PDF delivered to Telegram.")
            return True
        log.error("Telegram sendDocument failed: %s — %s", resp.status_code, resp.text)
        return False
    except Exception as exc:
        # Exception text from requests includes the request URL, and the URL
        # contains the bot token — keep the secret out of the logs.
        detail = str(exc)
        if BOT_TOKEN:
            detail = detail.replace(BOT_TOKEN, "<redacted>")
        log.error("Telegram send error: %s", detail)
        return False
def send_message(text: str) -> None:
    """Send a plain-text message to Telegram (used for error alerts).

    Best effort: an error status or an exception is logged and never raised,
    so a failed alert cannot mask the problem being reported.
    """
    url = f"https://api.telegram.org/bot{BOT_TOKEN}/sendMessage"
    try:
        resp = requests.post(url, json={"chat_id": CHAT_ID, "text": text}, timeout=15)
        if not resp.ok:
            # Without this check a rejected alert would vanish silently.
            log.error("Telegram sendMessage failed: %s — %s", resp.status_code, resp.text)
    except Exception as exc:
        # Exception text from requests includes the request URL, and the URL
        # contains the bot token — keep the secret out of the logs.
        detail = str(exc)
        if BOT_TOKEN:
            detail = detail.replace(BOT_TOKEN, "<redacted>")
        log.error("Telegram message error: %s", detail)
# ── Cleanup ────────────────────────────────────────────────────────────────
def prune_old_pdfs() -> None:
    """Remove papernews PDFs in OUTPUT_DIR last modified more than RETENTION_DAYS ago."""
    oldest_allowed = datetime.now() - timedelta(days=RETENTION_DAYS)
    for candidate in OUTPUT_DIR.glob("papernews-*.pdf"):
        modified = datetime.fromtimestamp(candidate.stat().st_mtime)
        # Still inside the retention window: keep it.
        if modified >= oldest_allowed:
            continue
        candidate.unlink()
        log.info("Pruned: %s", candidate.name)
# ── Main pipeline ──────────────────────────────────────────────────────────
def run() -> None:
    """Full pipeline: fetch → summarize → render → deliver → prune.

    Each section in SECTIONS is fetched and summarized in turn; sections that
    yield no stories are left out.  When nothing at all was produced, or the
    LaTeX compile fails, an alert is sent to Telegram and the run stops.  Old
    PDFs are pruned only after a successful delivery.
    """
    now = datetime.now(CT)
    # "%-d" (day without zero padding) is a platform-specific strftime
    # extension, so the day number is formatted separately.
    date_str = f"{now:%A, %B} {now.day}, {now:%Y}"
    pdf_path = OUTPUT_DIR / f"papernews-{now.strftime('%Y-%m-%d')}.pdf"
    log.info("Starting Papernews run for %s", date_str)
    client = anthropic.Anthropic(api_key=ANTHROPIC_KEY)
    body_parts = []
    for section in SECTIONS:
        log.info("Processing section: %s", section["title"])
        articles = fetch_articles(section["feeds"])
        log.info("  Fetched %d articles", len(articles))
        stories = summarize_section(client, section["title"], articles)
        log.info("  Summarized to %d stories", len(stories))
        if stories:
            body_parts.append(build_section_latex(section["title"], stories))
    if not body_parts:
        msg = "Papernews: no content generated today — check container logs."
        log.error(msg)
        send_message(msg)
        return
    # Substitute location, date, and body into the template using plain str.replace().
    # LOCATION comes straight from the environment, so it is escaped like any
    # other user-supplied text; the body parts are already escaped LaTeX.
    latex = (
        LATEX_TEMPLATE
        .replace("PAPERLOCATION", esc(LOCATION))
        .replace("PAPERDATE", date_str)
        .replace("PAPERBODY", "\n\n".join(body_parts))
    )
    if not compile_pdf(latex, pdf_path):
        send_message("Papernews: LaTeX compile failed — check container logs.")
        return
    if not send_pdf(pdf_path, f"Papernews — {date_str}"):
        log.error("PDF compiled but Telegram delivery failed: %s", pdf_path)
        return
    prune_old_pdfs()
    log.info("Papernews complete: %s", pdf_path.name)
# ── Entry point ────────────────────────────────────────────────────────────
if __name__ == "__main__":
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Optional one-off run before the scheduler takes over.
    if RUN_NOW:
        log.info("RUN_NOW=true — running immediately.")
        run()

    # Blocking scheduler: start() does not return while the daily job is active.
    scheduler = BlockingScheduler(timezone=CT)
    scheduler.add_job(
        run,
        trigger="cron",
        hour=SCHEDULE_HOUR,
        minute=SCHEDULE_MIN,
        id="papernews_daily",
    )
    log.info("Papernews scheduled for %02d:%02d daily.", SCHEDULE_HOUR, SCHEDULE_MIN)
    scheduler.start()