#!/usr/bin/env python3
"""
Clean protest_posts_master.csv:
1. Filter to No Kings / March 28 protest posts only
2. Aggressive deduplication (same handle >80%, cross-handle >85%)
3. Add post_time, original_url, engagement_note columns
"""

import csv
import re
import sys
from collections import defaultdict
from difflib import SequenceMatcher

INPUT_PATH = '/home/laddy/Documents/KnowledgeBases/topics/no-kings-3-28/protest_posts_master.csv'
OUTPUT_PATH = '/home/laddy/Documents/KnowledgeBases/topics/no-kings-3-28/protest_posts_master.csv'


# ---------------------------------------------------------------------------
# 1. Relevance filtering
# ---------------------------------------------------------------------------

# Primary: clearly about No Kings / March 28 Day of Action
PRIMARY_PATTERN = re.compile(
    r'(?:'
    r'no[\s_-]?kings?'
    r'|nokings'
    r'|#nokings'
    r'|nokingsday'
    r'|march\s+28'
    r'|3/28'
    r'|day\s+of\s+action'
    r'|no\s+rulers?'
    r'|no\s+crowns?'
    r'|no\s+thrones?'
    r'|nokings\.org'
    r'|50501movement'
    r'|standup_america'
    r'|mobilize\.us/nokings'
    r')',
    re.IGNORECASE,
)

# Secondary: strong contextual signals that, combined with union context, suggest No Kings.
# These must be specific enough to avoid false positives from unrelated union activity.
SECONDARY_PATTERN = re.compile(
    r'(?:'
    r'authoritarian\s+power\s+grab'
    r'|anti.?fascis'
    r'|no\s+ice.*no\s+war'
    r'|power\s+belongs\s+to\s+the\s+people'
    r'|biggest\s+single\s+day\s+of\s+protest'
    r'|millions.*took\s+to\s+the\s+streets'
    r'|eight\s+million.*streets'
    r'|hundreds\s+of\s+thousands.*streets'
    r'|#nokingsday'
    r')',
    re.IGNORECASE,
)

# Instagram image-text patterns for No Kings flyers
INSTAGRAM_NO_KINGS_PATTERN = re.compile(
    r'(?:'
    r'no\s+kings\s+saturday'
    r'|no\s+kings.*march\s+2[0-9]'
    r'|march\s+2[0-9].*no\s+kings'
    r'|protest\s+with\s+your\s+uni[o0]n\s+march\s+28'
    r'|march\s+28.*power\s+belongs'
    r'|power\s+belongs.*people.*march'
    r'|no\s+kings\s+power\s+belongs'
    r'|no\s+kings.*join\s+the\s+movement'
    r'|this\s+country\s+was\s+built\s+by\s+unions'
    r'|#nokings'
    r')',
    re.IGNORECASE,
)

# Posts to EXCLUDE regardless of any match (noise patterns)
EXCLUDE_PATTERN = re.compile(
    r'(?:'
    r'fix\s+tier\s+6'
    r'|hb\s*1492'
    r'|right\s+to\s+strike\s+bill'
    r'|praxis\s+bootcamp'
    r'|professional\s+development'
    r'|share\s+my\s+lesson'
    r'|sml\s+virtual\s+conference'
    r'|webinar'
    r'|march\s+in\s+march\s+bliss'           # CFT march in march is CA state capitol, not NoKings
    r')',
    re.IGNORECASE,
)

# Posts known to be about other marches/protests (not No Kings March 28)
# These were manually reviewed as ambiguous but NOT No Kings
NON_NOKINGS_SPECIFIC = {
    # rochesterta Fix Tier 6 rally
    # btubaltimore HB1492 rally
    # afthighered March 4th day of action (education funding, not No Kings)
    # pftlocal3 AFT Day of Action (school leafletting, pre-NoKings)
    # cftunion "March in March" at state capitol
}


def is_protest_post(row: dict) -> tuple[bool, str]:
    """Return (keep, note) for a row."""
    text = (row.get('text') or '') + ' ' + (row.get('hashtags') or '')

    # Hard exclude
    if EXCLUDE_PATTERN.search(text):
        return False, ''

    # Primary keywords: always keep
    if PRIMARY_PATTERN.search(text):
        return True, ''

    # Instagram alt-text with No Kings flyer descriptions
    if INSTAGRAM_NO_KINGS_PATTERN.search(text):
        return True, ''

    # Secondary signals: context-dependent
    if SECONDARY_PATTERN.search(text):
        return True, 'secondary signal — specific No Kings protest language'

    # aft_maryland "not to any one man" democracy framing: keep if explicitly framing
    # against one-person rule (No Kings messaging even if not the exact date)
    if row.get('handle') == 'aft_maryland' and re.search(
        r'not\s+to\s+any\s+one\s+man|belongs\s+to\s+its\s+people.*not.*one\s+man',
        text, re.IGNORECASE
    ):
        return True, 'No Kings messaging — "not to any one man" framing'

    return False, ''


# ---------------------------------------------------------------------------
# 2. Deduplication helpers
# ---------------------------------------------------------------------------

def normalize_text(text: str) -> str:
    """Lowercase, collapse whitespace for similarity comparison."""
    return re.sub(r'\s+', ' ', text.lower().strip())


def similarity(a: str, b: str) -> float:
    """Return similarity ratio between two strings (0-1)."""
    a_norm = normalize_text(a)
    b_norm = normalize_text(b)
    if not a_norm or not b_norm:
        return 0.0
    return SequenceMatcher(None, a_norm, b_norm, autojunk=False).ratio()


# ---------------------------------------------------------------------------
# 3. Time / engagement extraction
# ---------------------------------------------------------------------------

TIME_PATTERN = re.compile(
    r'\b(\d{1,2}(?::\d{2})?\s*(?:am|pm|a\.m\.|p\.m\.))\b',
    re.IGNORECASE,
)

ENGAGEMENT_PATTERN = re.compile(
    r'(\d[\d,]*)\s*(?:likes?|comments?|shares?|reposts?|reactions?)',
    re.IGNORECASE,
)


def extract_post_time(row: dict) -> str:
    """Extract time hint from text or date_hint."""
    combined = (row.get('text') or '') + ' ' + (row.get('date_hint') or '')
    m = TIME_PATTERN.search(combined)
    if m:
        return m.group(1).strip()
    return ''


def extract_engagement_note(row: dict) -> str:
    """Extract engagement numbers mentioned in text."""
    text = row.get('text') or ''
    hits = ENGAGEMENT_PATTERN.findall(text)
    if hits:
        return '; '.join(hits)
    return ''


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    with open(INPUT_PATH, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        original_fieldnames = reader.fieldnames
        rows = list(reader)

    total_before = len(rows)

    # Step 1: Filter to protest posts
    kept_rows = []
    removed_not_protest = 0
    ambiguous_notes = {}

    for row in rows:
        keep, note = is_protest_post(row)
        if keep:
            kept_rows.append(row)
            if note:
                ambiguous_notes[id(row)] = note
        else:
            removed_not_protest += 1

    # Step 2: Deduplication
    # Same-handle dedup: >80% similarity → keep longer
    # Group by handle
    from collections import defaultdict
    by_handle = defaultdict(list)
    for row in kept_rows:
        by_handle[row['handle']].append(row)

    deduped_rows = []
    removed_same_handle = 0

    for handle, handle_rows in by_handle.items():
        # Compare each pair within the handle
        to_remove = set()
        for i in range(len(handle_rows)):
            if i in to_remove:
                continue
            for j in range(i + 1, len(handle_rows)):
                if j in to_remove:
                    continue
                sim = similarity(handle_rows[i]['text'], handle_rows[j]['text'])
                if sim > 0.80:
                    # Keep longer text
                    len_i = len(handle_rows[i]['text'])
                    len_j = len(handle_rows[j]['text'])
                    if len_i >= len_j:
                        to_remove.add(j)
                    else:
                        to_remove.add(i)
        for i, row in enumerate(handle_rows):
            if i not in to_remove:
                deduped_rows.append(row)
            else:
                removed_same_handle += 1

    # Cross-handle dedup: >85% similarity → mark shorter as repost
    reposts_identified = 0
    n = len(deduped_rows)
    is_repost_flags = {}  # index → (is_repost, repost_of)

    for i in range(n):
        if i in is_repost_flags:
            continue
        for j in range(i + 1, n):
            if j in is_repost_flags:
                continue
            r_i = deduped_rows[i]
            r_j = deduped_rows[j]
            # Only cross-handle
            if r_i['handle'] == r_j['handle']:
                continue
            sim = similarity(r_i['text'], r_j['text'])
            if sim > 0.85:
                len_i = len(r_i['text'])
                len_j = len(r_j['text'])
                # Mark shorter as repost of longer
                if len_j <= len_i:
                    is_repost_flags[j] = ('yes', r_i['handle'])
                    reposts_identified += 1
                else:
                    is_repost_flags[i] = ('yes', r_j['handle'])
                    reposts_identified += 1

    # Apply repost flags
    for idx, (flag, of_handle) in is_repost_flags.items():
        deduped_rows[idx]['is_repost'] = flag
        deduped_rows[idx]['repost_of'] = of_handle

    # Step 3: Add new columns
    new_fieldnames = list(original_fieldnames)
    for col in ['post_time', 'original_url', 'engagement_note']:
        if col not in new_fieldnames:
            new_fieldnames.append(col)

    for row in deduped_rows:
        row['post_time'] = extract_post_time(row)
        row['original_url'] = row.get('profile_url', '')
        row['engagement_note'] = extract_engagement_note(row)

    # Write output
    with open(OUTPUT_PATH, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=new_fieldnames, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(deduped_rows)

    total_after = len(deduped_rows)
    duplicates_removed = removed_same_handle

    print('=' * 55)
    print('PROTEST POSTS CSV CLEANING SUMMARY')
    print('=' * 55)
    print(f'Posts before cleaning:       {total_before:>6}')
    print(f'Posts after cleaning:        {total_after:>6}')
    print(f'Posts removed (off-topic):   {removed_not_protest:>6}')
    print(f'Duplicates removed (same-handle >80%): {removed_same_handle:>3}')
    print(f'Reposts identified (cross-handle >85%): {reposts_identified:>2}')
    print(f'Output written to: {OUTPUT_PATH}')
    print('=' * 55)


if __name__ == '__main__':
    main()
