Daily AI-curated newspaper PDF delivered via Telegram. Fetches RSS feeds, summarizes top stories with Claude, compiles a LaTeX PDF in newspaper layout. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
356 lines
14 KiB
Python
356 lines
14 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
papernews.py — Daily AI-curated newspaper PDF
|
||
|
||
Fetches RSS feeds from configurable news sources, asks Claude to pick and
|
||
summarize the top stories, renders a newspaper-style PDF via LaTeX, delivers
|
||
it to Telegram, and prunes PDFs older than a configurable number of days.
|
||
|
||
Schedule: configurable via SCHEDULE_HOUR/SCHEDULE_MINUTE (default: 7:00am).
|
||
Timezone: configurable via TIMEZONE (default: America/Chicago).
|
||
Set RUN_NOW=true in the environment to fire once immediately on startup (for testing).
|
||
"""
|
||
|
||
import json
|
||
import logging
|
||
import os
|
||
import re
|
||
import shutil
|
||
import subprocess
|
||
import tempfile
|
||
from datetime import datetime, timedelta
|
||
from pathlib import Path
|
||
from zoneinfo import ZoneInfo
|
||
|
||
import anthropic
|
||
import feedparser
|
||
import requests
|
||
from apscheduler.schedulers.blocking import BlockingScheduler
|
||
|
||
# ── Configuration ──────────────────────────────────────────────────────────
|
||
# Required secrets: a missing variable raises KeyError as soon as the module loads.
BOT_TOKEN = os.environ["TELEGRAM_BOT_TOKEN"]
CHAT_ID = os.environ["TELEGRAM_CHAT_ID"]
ANTHROPIC_KEY = os.environ["ANTHROPIC_API_KEY"]

# Directory that receives the generated PDFs.
OUTPUT_DIR = Path("/output")

# Optional settings, each with a default.
LOCATION = os.getenv("LOCATION", "Your City")
TZ = ZoneInfo(os.getenv("TIMEZONE", "America/Chicago"))
RETENTION_DAYS = int(os.getenv("RETENTION_DAYS", "5"))
SCHEDULE_HOUR = int(os.getenv("SCHEDULE_HOUR", "7"))
SCHEDULE_MIN = int(os.getenv("SCHEDULE_MINUTE", "0"))
# Only the literal string "true" (any letter case) enables the immediate run.
RUN_NOW = os.getenv("RUN_NOW", "").lower() == "true"

logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=logging.INFO,
)
log = logging.getLogger(__name__)

# Short alias for the configured timezone, used by run() and the scheduler.
CT = TZ
|
||
|
||
|
||
# ── News sections and RSS feeds ────────────────────────────────────────────
|
||
# Each section has a display title and a list of (source_name, rss_url) pairs.
|
||
SECTIONS = [
    dict(
        title="Local --- Kansas City",
        feeds=[
            ("KCUR", "https://www.kcur.org/feed"),
            ("Fox4KC", "https://www.fox4kc.com/feed/"),
            ("KSHB", "https://www.kshb.com/news.rss"),
        ],
    ),
    dict(
        title="National",
        feeds=[
            ("AP News", "https://feeds.apnews.com/rss/apf-topnews"),
            ("NPR", "https://feeds.npr.org/1001/rss.xml"),
            ("PBS", "https://www.pbs.org/newshour/feeds/rss/headlines"),
        ],
    ),
    dict(
        title="International",
        feeds=[
            ("BBC World", "http://feeds.bbci.co.uk/news/world/rss.xml"),
            ("The Guardian", "https://www.theguardian.com/world/rss"),
            ("Reuters", "https://feeds.reuters.com/reuters/topNews"),
        ],
    ),
    dict(
        title="Technology",
        feeds=[
            ("Hacker News", "https://news.ycombinator.com/rss"),
            ("Ars Technica", "http://feeds.arstechnica.com/arstechnica/index"),
        ],
    ),
]

# Upper bound on the entries taken from any single feed before summarization.
MAX_ARTICLES_PER_FEED = 6
|
||
|
||
|
||
# ── LaTeX helpers ──────────────────────────────────────────────────────────
|
||
# Maps each special LaTeX character to its escaped equivalent.
|
||
# translate() processes each character independently — no double-substitution risk.
|
||
# Translation table keyed by code point. str.translate() looks at every input
# character exactly once, so the backslashes and braces that appear in the
# replacement strings are never escaped a second time.
_LATEX_ESCAPES = {
    ord(char): replacement
    for char, replacement in (
        ("\\", r"\textbackslash{}"),
        ("&", r"\&"),
        ("%", r"\%"),
        ("$", r"\$"),
        ("#", r"\#"),
        ("_", r"\_"),
        ("{", r"\{"),
        ("}", r"\}"),
        ("~", r"\textasciitilde{}"),
        ("^", r"\textasciicircum{}"),
    )
}


def esc(text: str) -> str:
    """Return *text* with LaTeX special characters replaced by safe sequences.

    The argument is passed through ``str()`` first, so non-string values are
    escaped via their string representation.
    """
    as_text = str(text)
    return as_text.translate(_LATEX_ESCAPES)
|
||
|
||
|
||
# Raw string template — avoids double-escaping LaTeX braces.
# The PAPERLOCATION, PAPERDATE and PAPERBODY placeholders are filled in by
# run() with plain str.replace() calls.
LATEX_TEMPLATE = r"""\documentclass[10pt]{article}
\usepackage[margin=0.65in, top=0.4in, columnsep=0.3in]{geometry}
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{lmodern}
\usepackage{multicol}
\usepackage{parskip}
\usepackage{microtype}
\usepackage{titlesec}

%% Section header: centered, bold, with a rule below
\titleformat{\section}[block]{\large\bfseries\filcenter}{}{0pt}{}[\titlerule]
\titlespacing*{\section}{0pt}{14pt}{6pt}

\setlength{\parindent}{0pt}
\setlength{\parskip}{4pt}
\pagestyle{empty}

\begin{document}

%% Masthead — full width above the two-column body
\begin{center}
{\fontsize{42}{50}\selectfont\textbf{Papernews}}\\[2pt]
\rule{\linewidth}{2pt}\\[2pt]
{\footnotesize PAPERLOCATION\hfill\textit{PAPERDATE}}\\
\rule{\linewidth}{0.4pt}
\end{center}
\vspace{6pt}

%% Two-column newspaper content
\begin{multicols}{2}

PAPERBODY

\end{multicols}
\end{document}
"""
|
||
|
||
|
||
def build_section_latex(title: str, stories: list) -> str:
    """Render one newspaper section from its AI-summarized stories.

    Args:
        title: Section heading; escaped before being placed in ``\\section*``.
        stories: Story entries, each expected to be a dict with ``"headline"``
            and ``"body"`` keys. Entries that are not dicts, or that have
            neither a headline nor a body, are skipped.

    Returns:
        LaTeX source for the section: the heading followed by one bold
        headline paragraph and one body paragraph per story.
    """
    lines = [rf"\section*{{{esc(title)}}}"]
    for story in stories:
        # The stories come from decoded model output, so tolerate stray
        # non-dict entries instead of failing on .get().
        if not isinstance(story, dict):
            continue
        # `or ""` turns an explicit JSON null into an empty string rather
        # than the literal text "None". Collapsing whitespace keeps a blank
        # line from appearing inside \textbf{...}, which pdflatex rejects.
        headline = esc(" ".join(str(story.get("headline") or "").split()))
        body = esc(story.get("body") or "")
        if not (headline or body):
            continue
        lines.extend([rf"\textbf{{{headline}}}", "", body, ""])
    return "\n".join(lines)
|
||
|
||
|
||
# ── RSS fetching ───────────────────────────────────────────────────────────
|
||
# Fetch via requests (with explicit timeout) then parse the content string,
|
||
# so feedparser never makes its own unguarded network calls that can hang forever.
|
||
_FEED_HEADERS = {"User-Agent": "Papernews/1.0 (rocklab daily digest)"}


def fetch_articles(feeds: list) -> list:
    """Pull the top headlines from each feed in a section.

    Args:
        feeds: Iterable of ``(source_name, rss_url)`` pairs.

    Returns:
        A list of dicts with ``"source"``, ``"title"`` and ``"summary"`` keys.
        At most MAX_ARTICLES_PER_FEED entries are taken per feed, entries
        without a title are dropped, and summaries are cut to 500 characters.
        A feed that fails to download or parse is logged and skipped.
    """
    articles = []
    for source, url in feeds:
        try:
            resp = requests.get(url, timeout=12, headers=_FEED_HEADERS)
            resp.raise_for_status()
            # Hand feedparser the raw bytes so it can detect the encoding
            # itself (XML declaration / BOM). resp.text would be decoded with
            # requests' guess, which falls back to ISO-8859-1 for text/*
            # responses without a charset and garbles UTF-8 feeds.
            feed = feedparser.parse(resp.content)
            for entry in feed.entries[:MAX_ARTICLES_PER_FEED]:
                title = entry.get("title", "").strip()
                summary = entry.get("summary", entry.get("description", "")).strip()
                # Strip HTML tags — Claude handles messy plain text fine
                summary = re.sub(r"<[^>]+>", " ", summary).strip()
                if title:
                    articles.append({
                        "source": source,
                        "title": title,
                        "summary": summary[:500],
                    })
        except Exception as exc:
            # Best effort: one broken feed must not stop the others.
            log.warning("Feed fetch failed — %s (%s): %s", source, url, exc)
    return articles
|
||
|
||
|
||
# ── Claude summarization ───────────────────────────────────────────────────
|
||
def summarize_section(client: anthropic.Anthropic, section_title: str, articles: list) -> list:
    """Ask Claude to select and summarize the top stories for one section.

    Args:
        client: Anthropic API client used for the request.
        section_title: Section name, used in the prompt and in log messages.
        articles: Dicts with ``"source"``, ``"title"`` and ``"summary"`` keys.

    Returns:
        A list of story dicts that each have a non-empty ``"headline"``.
        Returns an empty list when there are no articles, when the request or
        JSON decoding fails, or when the reply is not a JSON array.
    """
    if not articles:
        log.warning("No articles fetched for section: %s", section_title)
        return []

    article_text = "\n\n".join(
        f"[{a['source']}] {a['title']}\n{a['summary']}" for a in articles
    )

    prompt = (
        f"You are the {section_title} editor for a daily newspaper called Papernews.\n\n"
        "From the articles below, select the 4–6 most newsworthy stories and write a concise digest.\n\n"
        "Return ONLY a JSON array with no other text:\n"
        '[\n {"headline": "Short headline", "body": "1–3 sentences of newspaper prose."},\n ...\n]\n\n'
        "Rules: headlines under 10 words, body text factual and tight, no source names or URLs.\n\n"
        f"Articles:\n{article_text}"
    )

    try:
        msg = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=2048,
            messages=[{"role": "user", "content": prompt}],
        )
        raw = msg.content[0].text.strip()
        # Strip markdown code fences if Claude wraps the JSON
        if raw.startswith("```"):
            raw = raw.split("```")[1]
            if raw.startswith("json"):
                raw = raw[4:]
        parsed = json.loads(raw.strip())
    except Exception as exc:
        log.error("Claude summarization failed for '%s': %s", section_title, exc)
        return []

    # Valid JSON is not necessarily the requested shape: a bare object or a
    # list of strings would break the renderer, so validate before returning.
    if not isinstance(parsed, list):
        log.error(
            "Claude returned %s instead of a JSON array for '%s'",
            type(parsed).__name__,
            section_title,
        )
        return []
    stories = [item for item in parsed if isinstance(item, dict) and item.get("headline")]
    if len(stories) != len(parsed):
        log.warning(
            "Dropped %d malformed story entries for '%s'",
            len(parsed) - len(stories),
            section_title,
        )
    return stories
|
||
|
||
|
||
# ── PDF compilation ────────────────────────────────────────────────────────
|
||
def compile_pdf(latex: str, output_path: Path) -> bool:
    """Compile a LaTeX string to PDF and copy it to output_path.

    Args:
        latex: Complete LaTeX document source.
        output_path: Destination for the finished PDF.

    Returns:
        True when pdflatex exits with status 0 and the PDF was copied to
        output_path. False when pdflatex cannot be started, times out, exits
        non-zero, or produces no PDF.
    """
    with tempfile.TemporaryDirectory() as tmp:
        tex_path = Path(tmp) / "papernews.tex"
        tex_path.write_text(latex, encoding="utf-8")

        try:
            result = subprocess.run(
                ["pdflatex", "-interaction=nonstopmode", "-output-directory", tmp, str(tex_path)],
                capture_output=True,
                text=True,
                # The pdflatex log is not guaranteed to be valid UTF-8; replace
                # undecodable bytes instead of raising UnicodeDecodeError.
                errors="replace",
                # Bound the run so a wedged compiler cannot block the job forever.
                timeout=120,
            )
        except (OSError, subprocess.TimeoutExpired) as exc:
            # OSError covers a missing or non-executable pdflatex binary.
            log.error("pdflatex could not be run: %s", exc)
            return False

        pdf_src = Path(tmp) / "papernews.pdf"
        if result.returncode != 0 or not pdf_src.exists():
            log.error("pdflatex failed. Last 3000 chars of output:\n%s", result.stdout[-3000:])
            return False

        # Copy before the temporary directory is removed on leaving the block.
        shutil.copy2(pdf_src, output_path)
        return True
|
||
|
||
|
||
# ── Telegram delivery ──────────────────────────────────────────────────────
|
||
def send_pdf(path: Path, caption: str) -> bool:
    """Send the PDF at *path* to the configured Telegram chat as a document.

    Returns True when Telegram answers with a success status. Returns False,
    after logging the reason, on an error status or any exception.
    """
    endpoint = f"https://api.telegram.org/bot{BOT_TOKEN}/sendDocument"
    form_fields = {"chat_id": CHAT_ID, "caption": caption}
    try:
        with open(path, "rb") as pdf_file:
            resp = requests.post(
                endpoint,
                data=form_fields,
                files={"document": pdf_file},
                timeout=60,
            )
        if not resp.ok:
            log.error("Telegram sendDocument failed: %s — %s", resp.status_code, resp.text)
            return False
        log.info("PDF delivered to Telegram.")
        return True
    except Exception as exc:
        log.error("Telegram send error: %s", exc)
        return False
|
||
|
||
|
||
def send_message(text: str) -> None:
    """Send a plain-text message to Telegram (used for error alerts).

    Delivery is best effort: network errors and non-success responses are
    logged and never raised to the caller.
    """
    url = f"https://api.telegram.org/bot{BOT_TOKEN}/sendMessage"
    try:
        resp = requests.post(url, json={"chat_id": CHAT_ID, "text": text}, timeout=15)
        # A rejected request (bad token, unknown chat) raises nothing, so
        # check the status explicitly, mirroring send_pdf().
        if not resp.ok:
            log.error("Telegram sendMessage failed: %s — %s", resp.status_code, resp.text)
    except Exception as exc:
        log.error("Telegram message error: %s", exc)
|
||
|
||
|
||
# ── Cleanup ────────────────────────────────────────────────────────────────
|
||
def prune_old_pdfs() -> None:
    """Delete papernews PDFs from OUTPUT_DIR older than RETENTION_DAYS.

    Age is judged by each file's modification time. A file that cannot be
    inspected or removed is logged and skipped so the remaining files are
    still pruned.
    """
    cutoff = datetime.now() - timedelta(days=RETENTION_DAYS)
    for pdf in OUTPUT_DIR.glob("papernews-*.pdf"):
        try:
            if datetime.fromtimestamp(pdf.stat().st_mtime) >= cutoff:
                continue
            pdf.unlink()
        except OSError as exc:
            # Covers a file that vanished meanwhile or lacks permissions.
            log.warning("Could not prune %s: %s", pdf.name, exc)
            continue
        log.info("Pruned: %s", pdf.name)
|
||
|
||
|
||
# ── Main pipeline ──────────────────────────────────────────────────────────
|
||
def run() -> None:
    """Full pipeline: fetch → summarize → render → deliver → prune.

    For every entry in SECTIONS the feeds are fetched and summarized; sections
    that yield stories are rendered to LaTeX. The combined document is compiled
    to ``papernews-YYYY-MM-DD.pdf`` in OUTPUT_DIR and sent to Telegram. Old
    PDFs are pruned only after a successful delivery. When no content is
    produced or compilation fails, a Telegram alert is sent instead.
    """
    now = datetime.now(CT)
    # Take the day number from now.day: the "%-d" strftime flag is a platform
    # extension and is not available everywhere.
    date_str = f"{now.strftime('%A, %B')} {now.day}, {now.strftime('%Y')}"
    pdf_path = OUTPUT_DIR / f"papernews-{now.strftime('%Y-%m-%d')}.pdf"

    log.info("Starting Papernews run for %s", date_str)
    client = anthropic.Anthropic(api_key=ANTHROPIC_KEY)

    body_parts = []
    for section in SECTIONS:
        log.info("Processing section: %s", section["title"])
        articles = fetch_articles(section["feeds"])
        log.info(" Fetched %d articles", len(articles))
        stories = summarize_section(client, section["title"], articles)
        log.info(" Summarized to %d stories", len(stories))
        if stories:
            body_parts.append(build_section_latex(section["title"], stories))

    if not body_parts:
        msg = "Papernews: no content generated today — check container logs."
        log.error(msg)
        send_message(msg)
        return

    # Fill the template placeholders with plain str.replace(). LOCATION comes
    # from the environment and may contain LaTeX specials such as "&" or "_",
    # so it is escaped first. The body is substituted last so placeholder-like
    # text inside a story is never substituted again.
    latex = (
        LATEX_TEMPLATE
        .replace("PAPERLOCATION", esc(LOCATION))
        .replace("PAPERDATE", date_str)
        .replace("PAPERBODY", "\n\n".join(body_parts))
    )

    if not compile_pdf(latex, pdf_path):
        send_message("Papernews: LaTeX compile failed — check container logs.")
        return

    if not send_pdf(pdf_path, f"Papernews — {date_str}"):
        log.error("PDF compiled but Telegram delivery failed: %s", pdf_path)
        return

    prune_old_pdfs()
    log.info("Papernews complete: %s", pdf_path.name)
|
||
|
||
|
||
# ── Entry point ────────────────────────────────────────────────────────────
|
||
if __name__ == "__main__":
    # Make sure the PDF directory exists before the first run.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    if RUN_NOW:
        log.info("RUN_NOW=true — running immediately.")
        try:
            run()
        except Exception:
            # Log the traceback but keep going, so a failed immediate run does
            # not stop the daily schedule from being installed.
            log.exception("Immediate run failed; continuing to the daily schedule.")

    scheduler = BlockingScheduler(timezone=CT)
    scheduler.add_job(run, "cron", hour=SCHEDULE_HOUR, minute=SCHEDULE_MIN, id="papernews_daily")
    log.info("Papernews scheduled for %02d:%02d daily.", SCHEDULE_HOUR, SCHEDULE_MIN)
    # Blocks the main thread for the life of the process.
    scheduler.start()
|