#!/usr/bin/env python3
"""Convert crawled FB/IG markdown posts into a JSON feed for monitor.html.

Reads all results/posts/*.md files, extracts individual posts,
and outputs fbig-feed.json in the same format as meltwater-feed.json.

For posts with image descriptions (Instagram alt text), extracts those as media hints.
For text-only posts, includes a screenshot_url field if a screenshot exists.

Usage: python3 build_fbig_feed.py [--watch]
  --watch: re-run every 60 seconds to pick up new crawl results
"""

import hashlib
import json
import os
import re
import sys
import time
from datetime import datetime
from pathlib import Path

# Input: one markdown file per crawled profile (written by the crawler).
POSTS_DIR = Path(__file__).parent / "results" / "posts"
# Output JSON feed consumed by monitor.html (same schema as meltwater-feed.json).
OUTPUT = Path(__file__).parent / "fbig-feed.json"
# NOTE(review): not referenced anywhere in this file — presumably backs the
# screenshot_url mentioned in the module docstring; confirm before removing.
SCREENSHOTS_DIR = Path(__file__).parent / "results" / "screenshots"


def parse_frontmatter(content):
    """Return key/value pairs from a leading YAML frontmatter block.

    Only flat ``key: value`` lines are understood; surrounding single or
    double quotes around the value are stripped.  Returns {} when there is
    no ``---`` fence at the start of *content* or no closing fence.
    """
    meta = {}
    if not content.startswith("---"):
        return meta
    closing = content.find("---", 3)
    if closing <= 0:
        return meta
    for raw_line in content[3:closing].strip().split("\n"):
        if ":" not in raw_line:
            continue
        name, _, value = raw_line.partition(":")
        meta[name.strip()] = value.strip().strip('"').strip("'")
    return meta


def unescape_content(body):
    """Unescape JSON-encoded strings that agent-browser returns.

    A body wrapped in double quotes is treated as a JSON string literal
    and decoded; if decoding fails the quotes are simply dropped.  Any
    remaining literal ``\\n`` / ``\\t`` sequences are then expanded.
    """
    text = body.strip()
    if text.startswith('"') and text.endswith('"'):
        try:
            text = json.loads(text)
        except Exception:
            # Not valid JSON after all — just strip the surrounding quotes.
            text = text[1:-1]
    return text.replace("\\n", "\n").replace("\\t", "\t")


# Case-insensitive match for protest-related language.  `no.?kings` also
# catches hyphen/space variants ("no-kings", "no kings"); `demonstrat`
# deliberately matches both "demonstrate" and "demonstration".
PROTEST_RE = re.compile(
    r"no.?kings|nokings|protest|march\s+28|rally|demonstrat|strike|solidarity|stand\s+up|fight\s+back|resist",
    re.IGNORECASE,
)


def try_parse_json_body(body):
    """Try to parse the body as JSON (new image-enriched format).

    Strips a leading markdown heading line, then attempts a direct JSON
    parse; on failure retries after running the body through
    unescape_content().  Returns the parsed object (list or dict) or
    None when the body is not JSON in either form.
    """
    body = body.strip()
    # Strip markdown heading, if present, before parsing.
    body = re.sub(r"^#[^\n]*\n+", "", body).strip()

    # Try direct JSON parse (array or object).  The original looped over
    # a single-element list here; a plain try/except is equivalent.
    try:
        return json.loads(body)
    except (json.JSONDecodeError, ValueError):
        pass

    # Fall back to unescaping the agent-browser string wrapper first.
    try:
        return json.loads(unescape_content(body))
    except (json.JSONDecodeError, ValueError):
        return None


def extract_posts_from_md(filepath):
    """Extract individual posts from a crawled markdown file.

    Two body formats are handled:
      * New JSON format (image-enriched recrawl): a JSON array (facebook)
        or an object with a "posts" key (instagram).
      * Legacy plain-text format: sections split on "---" (instagram) or
        "---POST---" (facebook) markers.

    Returns a list of post dicts in the monitor feed schema.  Each dict
    carries a transient "has_protest" flag that build_feed() strips
    before writing the output.
    """
    content = filepath.read_text(errors="replace")
    fm = parse_frontmatter(content)

    handle = fm.get("handle", filepath.stem.split("_", 1)[-1])
    platform = fm.get("platform", "unknown")
    url = fm.get("url", "")
    org = fm.get("org", handle)

    # Strip the YAML frontmatter fence so only the crawled body remains.
    body = content
    if content.startswith("---"):
        end = content.find("---", 3)
        if end > 0:
            body = content[end + 3:]

    posts = []

    def _id_suffix(text):
        # Deterministic short suffix for post IDs.  The previous
        # abs(hash(text)) % 100000 varied between interpreter runs
        # (PYTHONHASHSEED), so IDs could not be used for cross-run
        # deduplication; an md5 prefix is stable.
        return int(hashlib.md5(text.encode("utf-8", "replace")).hexdigest()[:8], 16) % 100000

    # Try new JSON format first (from image-enriched recrawl)
    parsed = try_parse_json_body(body)

    if parsed is not None:
        # New format: JSON array (facebook) or object with .posts (instagram)
        raw_posts = []
        if isinstance(parsed, list):
            raw_posts = parsed
        elif isinstance(parsed, dict) and "posts" in parsed:
            raw_posts = parsed["posts"]
            if not org or org == handle:
                org = parsed.get("displayName", org)

        for i, p in enumerate(raw_posts):
            # Coerce so an explicit null "text" doesn't crash len().
            text = p.get("text") or ""
            if len(text) < 20:
                continue

            has_protest = bool(PROTEST_RE.search(text))

            # Media with real CDN URLs from the recrawl; anything without a
            # recognizable FB/IG CDN host is dropped.
            media = []
            for m in p.get("media", []):
                img_url = m.get("url", "")
                if img_url and ("scontent" in img_url or "cdninstagram" in img_url or "fbcdn" in img_url):
                    media.append({
                        "type": m.get("type", "image"),
                        "url": img_url,
                        # "alt" may be present but null; coerce before slicing.
                        "alt": (m.get("alt") or "")[:200],
                    })

            # Prefer the post's own date ("March 28, 2025" style); fall
            # back to the crawl timestamp from the frontmatter.
            post_date = p.get("date", "")
            created = fm.get("crawled", datetime.now().isoformat())
            if post_date:
                try:
                    created = datetime.strptime(post_date, "%B %d, %Y").isoformat()
                except ValueError:
                    pass

            post_id = f"fbig-{platform[:2]}-{handle}-{i}-{_id_suffix(text)}"
            posts.append({
                "id": post_id,
                "platform": platform,
                "source": "fbig",
                "handle": handle,
                "displayName": org,
                "avatar": "",
                "text": text[:1000],
                "hashtags": p.get("hashtags", re.findall(r"#(\w+)", text)),
                "likes": p.get("likes", "0"),
                "comments": p.get("comments", "0"),
                "shares": p.get("shares", "0"),
                "reposts": 0, "replies": 0, "quotes": 0,
                "createdAt": created,
                "timeHint": p.get("timeHint", ""),
                "originalUrl": url or f"https://www.{'facebook.com' if platform == 'facebook' else 'instagram.com'}/{handle}",
                "media": media,
                "has_protest": has_protest,
                "sentiment": "Positive" if has_protest else "",
            })

        return posts

    # Fallback: old text format (pre-image crawl)
    body = unescape_content(body)
    body = re.sub(r"^#[^\n]*\n+", "", body.strip())

    if platform == "instagram":
        sections = re.split(r"\n---\n", body)
        for i, section in enumerate(sections):
            section = section.strip()
            if not section or len(section) < 30:
                continue
            # Skip profile/header boilerplate sections.
            if section.startswith("## Profile") or section.startswith("## Recent"):
                continue
            if "followers" in section and "following" in section and len(section) < 200:
                continue

            has_protest = bool(PROTEST_RE.search(section))
            date_match = re.search(r"on ((?:March|February|January|April|May) \d+, \d{4})", section)
            post_date = ""
            if date_match:
                try:
                    post_date = datetime.strptime(date_match.group(1), "%B %d, %Y").isoformat()
                except ValueError:
                    pass

            # Instagram alt-text exports start with these phrases; treat the
            # whole section as an image description in that case.
            is_image_desc = section.startswith("Photo by ") or "May be " in section[:100]

            post_id = f"fbig-ig-{handle}-{i}-{_id_suffix(section)}"
            posts.append({
                "id": post_id,
                "platform": "instagram",
                "source": "fbig",
                "handle": handle,
                "displayName": org,
                "avatar": "",
                "text": section[:1000],
                "hashtags": re.findall(r"#(\w+)", section),
                "likes": 0, "reposts": 0, "replies": 0, "quotes": 0,
                "createdAt": post_date or fm.get("crawled", datetime.now().isoformat()),
                "originalUrl": f"https://www.instagram.com/{handle}/",
                "media": [{"type": "image", "url": "", "alt": section[:200]}] if is_image_desc else [],
                "has_protest": has_protest,
                "sentiment": "Positive" if has_protest else "",
            })

    elif platform == "facebook":
        raw_posts = re.split(r"\n*---POST---\n*", body)
        for i, post_text in enumerate(raw_posts):
            post_text = post_text.strip()
            if not post_text or len(post_text) < 30:
                continue
            # Skip login-wall and profile-header boilerplate.
            if "Sign Up" in post_text and "Log In" in post_text:
                continue
            if post_text.startswith("# "):
                continue
            if "Verified account" in post_text and "followers" in post_text:
                continue

            has_protest = bool(PROTEST_RE.search(post_text))

            post_id = f"fbig-fb-{handle}-{i}-{_id_suffix(post_text)}"
            posts.append({
                "id": post_id,
                "platform": "facebook",
                "source": "fbig",
                "handle": handle,
                "displayName": org,
                "avatar": "",
                "text": post_text[:1000],
                "hashtags": re.findall(r"#(\w+)", post_text),
                "likes": 0, "reposts": 0, "replies": 0, "quotes": 0,
                "createdAt": fm.get("crawled", datetime.now().isoformat()),
                "originalUrl": f"https://www.facebook.com/{handle}",
                "media": [],
                "has_protest": has_protest,
                "sentiment": "Positive" if has_protest else "",
            })

    return posts


def build_feed():
    """Build the complete feed from all crawled files.

    Scans POSTS_DIR for *.md crawl results, extracts posts from each,
    keeps only the protest-related ones, and strips the transient
    "has_protest" flag before returning the list.
    """
    all_posts = []

    if not POSTS_DIR.exists():
        print("No posts directory found")
        return []

    for md_file in sorted(POSTS_DIR.glob("*.md")):
        if md_file.is_dir():
            continue
        try:
            all_posts.extend(extract_posts_from_md(md_file))
        except Exception as e:
            # Best-effort: one malformed crawl file must not kill the build.
            print(f"  Error processing {md_file.name}: {e}")

    # Filter: only include posts with protest content.
    protest_posts = [p for p in all_posts if p.get("has_protest")]

    # Strip the internal flag from every post in one pass.  (The original
    # deleted it from protest_posts and then again from all_posts, which
    # re-processed the same dicts.)
    for p in all_posts:
        p.pop("has_protest", None)

    return protest_posts


def build_csv(posts, output_path):
    """Write posts to CSV for the chronicle to read.

    Emits one row per post with a fixed column set; fields not tracked by
    this feed (city, state, repost info, engagement_note) are left blank.
    """
    import csv

    columns = [
        "handle", "platform", "org", "profile_url", "text",
        "image_urls", "has_image", "image_count", "hashtags",
        "date_hint", "crawled", "city", "state", "is_repost", "repost_of",
        "post_time", "original_url", "engagement_note",
        "likes", "comments", "shares",
    ]

    with open(output_path, "w", newline="") as out:
        writer = csv.DictWriter(out, fieldnames=columns, extrasaction="ignore")
        writer.writeheader()
        for post in posts:
            # Only media entries that actually carry a URL count as images.
            media_urls = [m.get("url", "") for m in post.get("media", []) if m.get("url")]
            writer.writerow({
                "handle": post.get("handle", ""),
                "platform": post.get("platform", ""),
                "org": post.get("displayName", ""),
                "profile_url": post.get("originalUrl", ""),
                "text": post.get("text", ""),
                "image_urls": "|".join(media_urls),
                "has_image": "yes" if media_urls else "no",
                "image_count": len(media_urls),
                "hashtags": ",".join(post.get("hashtags", [])),
                "date_hint": post.get("timeHint", ""),
                "crawled": post.get("createdAt", ""),
                "city": "",
                "state": "",
                "is_repost": "",
                "repost_of": "",
                "post_time": post.get("timeHint", ""),
                "original_url": post.get("originalUrl", ""),
                "engagement_note": "",
                "likes": post.get("likes", "0"),
                "comments": post.get("comments", "0"),
                "shares": post.get("shares", "0"),
            })


def main():
    """Build the feed once and write it to OUTPUT; loop every 60 s with --watch.

    Fixes: removed the unused ``include_all`` / ``--all`` flag (it was read
    but never acted on, and is not documented in the usage string).
    """
    watch = "--watch" in sys.argv

    while True:
        posts = build_feed()
        print(f"[{datetime.now().strftime('%H:%M:%S')}] Generated {len(posts)} protest posts from FB/IG crawls")

        # default=str so any non-JSON-native value (e.g. datetime) is
        # stringified rather than raising.
        with open(OUTPUT, "w") as f:
            json.dump(posts, f, indent=2, default=str)

        print(f"  Written to {OUTPUT}")

        if not watch:
            break

        print("  Watching for changes (60s interval)...")
        time.sleep(60)


if __name__ == "__main__":
    main()
