
#!/usr/bin/env python3
"""
#OpTanzania Scraper
- Modes: news, social, blackouts
- Saves JSON lines into raw_data/{news|social|blackouts}
- Downloads media (optional via yt-dlp for video)
"""
import os, sys, json, time, argparse, hashlib
from datetime import datetime, timezone
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
import yaml
from dateutil import parser as dtp
from tqdm import tqdm

def sha256(s: bytes) -> str:
    """Return the hexadecimal SHA-256 digest of the byte string *s*."""
    return hashlib.sha256(s).hexdigest()

def load_config(path="config.yaml"):
    """Load the scraper configuration from a YAML file.

    Args:
        path: Location of the YAML configuration file.

    Returns:
        The parsed YAML document. An empty document yields ``{}`` so that
        callers can always use ``cfg.get(...)``.

    Exits the process with status 1 (after printing a hint to stderr) when
    no file exists at *path*.
    """
    if not os.path.exists(path):
        print("Config not found. Copy config.sample.yaml to config.yaml and edit.", file=sys.stderr)
        sys.exit(1)
    with open(path, "r", encoding="utf-8") as f:
        # safe_load returns None for an empty file; normalise that to an
        # empty mapping instead of letting callers crash on None.get(...).
        return yaml.safe_load(f) or {}

def ensure_dir(p):
    """Create directory *p*, including missing parents, if it does not exist.

    An empty path is treated as "the current directory" and is a no-op:
    ``os.makedirs("")`` raises ``FileNotFoundError``, and an empty string is
    exactly what ``os.path.dirname()`` returns for a bare filename.
    """
    if not p:
        return
    os.makedirs(p, exist_ok=True)

def save_jsonl(records, out_path):
    """Append *records* to *out_path* in JSON Lines format.

    The parent directory of *out_path* is created first. Each record is
    serialised as one UTF-8 line with non-ASCII characters left unescaped.
    The file is opened in append mode, so existing lines are kept.
    """
    ensure_dir(os.path.dirname(out_path))
    with open(out_path, "a", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(rec, ensure_ascii=False) + "\n" for rec in records
        )

def fetch_rss(url):
    """Fetch an RSS or Atom feed and return its entries as plain dicts.

    Args:
        url: Address of the feed.

    Returns:
        A list with one dict per ``<item>``/``<entry>`` element, with keys
        ``title`` (str, possibly empty), ``link`` (str, possibly empty),
        ``published`` (ISO-8601 string, or None when the date is missing or
        cannot be parsed) and ``summary`` (str or None).

    Raises:
        requests.RequestException: On network failure or a non-2xx status
            (via ``raise_for_status``).
    """
    # Lightweight RSS fetcher (no external deps beyond bs4/requests)
    r = requests.get(url, timeout=20)
    r.raise_for_status()
    # Hand the parser raw bytes so it honours the feed's own XML encoding
    # declaration; r.text would decode using requests' guess from the HTTP
    # headers, which can garble non-ASCII text.
    soup = BeautifulSoup(r.content, "xml")
    items = []
    for item in soup.find_all(["item", "entry"]):
        title_tag = item.find("title")
        title = title_tag.get_text(strip=True) if title_tag else ""

        # Atom carries the URL in an href attribute, RSS in the element text.
        link_tag = item.find("link")
        if not link_tag:
            link = ""
        elif link_tag.has_attr("href"):
            link = link_tag.get("href")
        else:
            link = link_tag.get_text(strip=True)

        pub = item.find("pubDate") or item.find("updated") or item.find("published")
        pub_ts = None
        if pub:
            try:
                pub_ts = dtp.parse(pub.get_text(strip=True)).isoformat()
            except (ValueError, OverflowError):
                # A malformed date must not discard this entry or abort the
                # rest of the feed; record the entry without a timestamp.
                pub_ts = None

        desc = item.find("description") or item.find("summary")
        items.append({
            "title": title,
            "link": link,
            "published": pub_ts,
            "summary": desc.get_text(strip=True) if desc else None
        })
    return items

def fetch_html_listing(url, list_selector, href_attr="href", title_attr=None, limit=50):
    """Scrape links from an HTML page using a CSS selector.

    Args:
        url: Page to download; also the base for resolving relative links.
        list_selector: CSS selector matching the elements that carry links.
        href_attr: Attribute holding the link target (falls back to
            ``"href"`` when falsy).
        title_attr: Optional attribute holding the title. When unset, or
            when the attribute is missing/empty on an element, the
            element's text is used, and finally the resolved link.
        limit: Maximum number of matched elements to inspect. Elements
            without a link still count towards this limit.

    Returns:
        A list of dicts with keys ``title``, ``link`` (absolute URL) and
        ``discovered_at`` (UTC ISO-8601 timestamp).

    Raises:
        requests.RequestException: On network failure or a non-2xx status
            (via ``raise_for_status``).
    """
    r = requests.get(url, timeout=20)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "lxml")
    out = []
    for a in soup.select(list_selector)[:limit]:
        href = a.get(href_attr or "href")
        if not href:
            continue
        link = urljoin(url, href)
        # Prefer the configured attribute, but never emit a null/empty title
        # just because one element lacks it.
        title = a.get(title_attr) if title_attr else None
        if not title:
            title = a.get_text(strip=True) or link
        out.append({"title": title, "link": link, "discovered_at": datetime.now(timezone.utc).isoformat()})
    return out

def mode_news(cfg):
    """Collect items from every configured news source and append them to
    ``raw_data/news/news.jsonl``.

    Each entry of ``cfg["news_sources"]`` is read for ``type`` (``"rss"`` or
    ``"html"``), ``url`` and ``name``, plus the optional ``list_selector``,
    ``href_attr`` and ``title_attr`` for HTML sources. A failing source does
    not stop the run: an error record with the source name and URL is written
    in its place.

    Args:
        cfg: Parsed configuration mapping.
    """
    records = []
    # `or []`: a YAML key with no value loads as None, which is not iterable.
    for src in cfg.get("news_sources") or []:
        try:
            kind = src["type"]
            if kind == "rss":
                items = fetch_rss(src["url"])
            elif kind == "html":
                items = fetch_html_listing(src["url"], src.get("list_selector","a"), src.get("href_attr","href"), src.get("title_attr"))
            else:
                # Surface misconfigured sources instead of skipping silently.
                records.append({"error": f"unsupported source type: {kind!r}", "source": src.get("name"), "url": src.get("url")})
                continue
            for item in items:
                item["source"] = src["name"]
                records.append(item)
        except Exception as e:
            # Best-effort scraping: record the failure and move on.
            records.append({"error": str(e), "source": src.get("name"), "url": src.get("url")})
    save_jsonl(records, "raw_data/news/news.jsonl")
    print(f"[news] Saved {len(records)} records")

def twitter_collect(cfg, since=None):
    """Run one Twitter API v2 recent-search request from the config.

    Settings are read from ``cfg["social"]["twitter"]``: ``enabled``,
    ``bearer_token``, ``query``, and the optional ``tweet_fields``,
    ``expansions``, ``user_fields`` and ``max_results``.

    Args:
        cfg: Parsed configuration mapping.
        since: Optional date/time string used as the search start time. A
            value without a timezone is assumed to be UTC.

    Returns:
        A one-element list holding the decoded JSON response, or an empty
        list when Twitter is disabled, the token or query is missing, or the
        API answers with a non-200 status.
    """
    # Tolerate `social:` / `twitter:` keys that are present but empty (None).
    tw = (cfg.get("social") or {}).get("twitter") or {}
    if not tw.get("enabled"):
        print("[social] Twitter disabled in config")
        return []
    token = tw.get("bearer_token")
    if not token or token == "YOUR_TWITTER_BEARER_TOKEN":
        print("[social] Missing bearer token; skipping")
        return []
    query = (tw.get("query") or "").strip()
    if not query:
        # An empty query is always rejected by the API; skip the round trip.
        print("[social] Empty Twitter query; skipping")
        return []
    headers = {"Authorization": f"Bearer {token}"}
    url = "https://api.twitter.com/2/tweets/search/recent"
    params = {
        "query": query,
        "tweet.fields": tw.get("tweet_fields","created_at,public_metrics,lang,source"),
        "expansions": tw.get("expansions","author_id"),
        "user.fields": tw.get("user_fields","username,public_metrics,verified"),
        # The recent-search endpoint only accepts 10..100 results per page.
        "max_results": max(10, min(int(tw.get("max_results",50)), 100)),
    }
    if since:
        start = dtp.parse(since)
        if start.tzinfo is None:
            start = start.replace(tzinfo=timezone.utc)
        # The API expects an RFC 3339 UTC timestamp (YYYY-MM-DDTHH:mm:ssZ);
        # a naive isoformat() string carries no timezone designator.
        params["start_time"] = start.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    r = requests.get(url, headers=headers, params=params, timeout=30)
    if r.status_code != 200:
        print(f"[social] Twitter API error: {r.status_code} {r.text}")
        return []
    data = r.json()
    return [data]

def mode_social(cfg, since=None):
    """Gather social-media data and append it to
    ``raw_data/social/social.jsonl``.

    Only Twitter is queried. If the collector raises, an error record tagged
    ``"twitter"`` is stored instead of aborting the run.

    Args:
        cfg: Parsed configuration mapping.
        since: Optional start date/time string passed on to the collector.
    """
    collected = []
    # Twitter (optional)
    try:
        collected.extend(twitter_collect(cfg, since=since))
    except Exception as err:
        collected.append({"error": str(err), "source": "twitter"})
    save_jsonl(collected, "raw_data/social/social.jsonl")
    print(f"[social] Saved {len(collected)} records")

def mode_blackouts(cfg):
    """Scrape every configured blackout monitor page and append the links
    found to ``raw_data/blackouts/blackouts.jsonl``.

    Each entry of ``cfg["blackout_monitors"]`` is read for ``url`` and
    ``name``, plus the optional ``list_selector``, ``href_attr`` and
    ``title_attr``. A failing monitor does not stop the run: an error record
    with the monitor name and URL is written in its place.

    Args:
        cfg: Parsed configuration mapping.
    """
    records = []
    # `or []`: a YAML key with no value loads as None, which is not iterable.
    for src in cfg.get("blackout_monitors") or []:
        try:
            # title_attr is passed through so monitors accept the same
            # options as HTML news sources; it defaults to None.
            items = fetch_html_listing(src["url"], src.get("list_selector","a"), src.get("href_attr","href"), src.get("title_attr"))
            for it in items:
                it["source"] = src["name"]
                records.append(it)
        except Exception as e:
            # Best-effort scraping: record the failure and move on.
            records.append({"error": str(e), "source": src.get("name"), "url": src.get("url")})
    save_jsonl(records, "raw_data/blackouts/blackouts.jsonl")
    print(f"[blackouts] Saved {len(records)} records")

def main():
    """Parse command-line arguments, load the config and run the chosen mode.

    ``--mode`` (required) selects news, social or blackouts; ``--since`` is
    forwarded to the social mode only; ``--config`` names the YAML file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", choices=["news","social","blackouts"], required=True)
    parser.add_argument("--since", help="ISO date/time for social start")
    parser.add_argument("--config", default="config.yaml")
    opts = parser.parse_args()

    config = load_config(opts.config)

    # argparse's `choices` guarantees opts.mode is one of these keys.
    runners = {
        "news": lambda: mode_news(config),
        "social": lambda: mode_social(config, since=opts.since),
        "blackouts": lambda: mode_blackouts(config),
    }
    runners[opts.mode]()

# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
