#!/usr/bin/env python3
"""
Fill in city, state, is_repost, repost_of columns in protest_posts_master.csv
"""

import csv
import json
import re
import difflib
import os

# Input/output CSV (rewritten in place) and the Meltwater JSON feed.
# NOTE(review): machine-specific absolute paths — consider promoting these
# to CLI arguments or environment variables for portability.
CSV_PATH = '/home/laddy/Documents/KnowledgeBases/topics/no-kings-3-28/protest_posts_master.csv'
MELTWATER_PATH = '/home/laddy/Documents/KnowledgeBases/topics/no-kings-3-28/meltwater-feed.json'

# ── Geo mapping ──────────────────────────────────────────────────────────────
# (handle → (city, state))
# Handle → (city, state) lookup used by get_geo().  Keys must be lowercase
# because get_geo() lowercases the handle before the lookup.  A ('', '')
# value means "known handle, unknown location" and deliberately falls
# through to the text-based inference in TEXT_GEO_PATTERNS.
GEO_MAP = {
    # National / HQ
    'aftunion':         ('Washington', 'DC'),
    'aftteach':         ('Washington', 'DC'),
    'afthealthcare':    ('Washington', 'DC'),
    'afthigher':        ('Washington', 'DC'),
    'afthighered':      ('Washington', 'DC'),
    'aftacademics':     ('Washington', 'DC'),
    'teachersunify':    ('Washington', 'DC'),
    'teachersunifypc':  ('Washington', 'DC'),  # Teachers Unify Podcast — national

    # Statewide orgs (use state capital or largest city)
    'cftunion':         ('Burlingame', 'CA'),   # California Federation of Teachers
    'ucaft':            ('Oakland', 'CA'),       # UC AFT — statewide CA
    'iftaft':           ('Springfield', 'IL'),
    'floridaea':        ('Tallahassee', 'FL'),
    'aftct':            ('Hartford', 'CT'),
    'aftindiana':       ('Indianapolis', 'IN'),
    'aftkansas':        ('Topeka', 'KS'),
    'aftmass':          ('Boston', 'MA'),
    'aftnewhampshire':  ('Concord', 'NH'),
    'aftmaryland':      ('Baltimore', 'MD'),
    'aft_maryland':     ('Baltimore', 'MD'),
    'aftmichigan':      ('Lansing', 'MI'),
    'aftnj':            ('Trenton', 'NJ'),
    'aftnewjersey':     ('Trenton', 'NJ'),      # AFTNJ Instagram handle
    'aftoregon':        ('Portland', 'OR'),
    'aftnm':            ('Albuquerque', 'NM'),
    'aftutah':          ('Salt Lake City', 'UT'),
    'aftwa':            ('Seattle', 'WA'),
    'aftwisconsin':     ('Madison', 'WI'),
    'texasaft':         ('Austin', 'TX'),
    'ky120unitedaft':   ('Frankfort', 'KY'),
    'lafedteachers':    ('Baton Rouge', 'LA'),
    'oftunion':         ('Columbus', 'OH'),      # Ohio Federation of Teachers

    # City / local unions
    'aft2121':          ('San Francisco', 'CA'),
    'aft6157':          ('San Jose', 'CA'),
    'aftlocal1825':     ('St. Thomas', 'USVI'),
    'aftlocal1828':     ('Ventura', 'CA'),       # Ventura County Federation of College Teachers
    'aftlocal1904':     ('Montclair', 'NJ'),
    'allianceaft':      ('Dallas', 'TX'),
    'atfunion':         ('Albuquerque', 'NM'),
    'atfteachersunion': ('Albuquerque', 'NM'),   # Albuquerque Teachers Federation
    'backusnurses':     ('Norwich', 'CT'),
    'birminghamaft':    ('Birmingham', 'AL'),
    'btubaltimore':     ('Baltimore', 'MD'),      # Baltimore Teachers Union
    'chtu795':          ('Cleveland Heights', 'OH'),
    'cincyteachers':    ('Cincinnati', 'OH'),     # Cincinnati Federation of Teachers
    'cnjscl':           ('Trenton', 'NJ'),
    'ctulocal1':        ('Chicago', 'IL'),
    'ctu279':           ('Cleveland', 'OH'),
    'dcacts':           ('Washington', 'DC'),
    'hft2415':          ('Houston', 'TX'),
    'htf394':           ('Hammond', 'IN'),
    'ithacateachers':   ('Ithaca', 'NY'),
    'kunanurses':       ('Kuna', 'ID'),
    'lajoyaaft':        ('La Joya', 'TX'),
    'mywsna':           ('Seattle', 'WA'),        # Washington State Nurses Association
    'nbft1060':         ('North Bergen', 'NJ'),
    'nehouaft':         ('Houston', 'TX'),
    'nhft933':          ('New Haven', 'CT'),
    'ofnhp':            ('Portland', 'OR'),
    'pccffap':          ('Portland', 'OR'),
    'pft400':           ('Pittsburgh', 'PA'),
    'pftlocal3':        ('Philadelphia', 'PA'),
    'ptu958':           ('Providence', 'RI'),
    'ramapoaft':        ('Mahwah', 'NJ'),
    'rheateachers':     ('', ''),                 # unknown — falls through to text inference
    'rifthp':           ('Providence', 'RI'),
    'rochesterta':      ('Rochester', 'NY'),
    'ruaaup':           ('New Brunswick', 'NJ'),
    'ruaaup_ptl':       ('New Brunswick', 'NJ'),  # Rutgers Adjunct Faculty Union
    'saalocal67':       ('San Antonio', 'TX'),    # San Antonio Alliance
    'salemteachers':    ('Salem', 'MA'),
    'sft803':           ('Schenectady', 'NY'),
    'socorroaft':       ('Socorro', 'TX'),
    'spfldfedofparas':  ('Springfield', 'MA'),
    'springbranchaft':  ('Houston', 'TX'),        # Spring Branch (Houston suburb/ISD)
    'staunionhall':     ('Syracuse', 'NY'),       # Syracuse Teachers Association
    'williamsvilleta':  ('Williamsville', 'NY'),
    'wstu571':          ('Hillside', 'IL'),

    # Other
    '3151lft':          ('Lompoc', 'CA'),         # Lompoc Federation of Teachers
    'apea_aft':         ('Kodiak', 'AK'),          # Alaska Public Employees Assoc; posts mention Kodiak

    # Catch-all for handles that appear in data but not listed above
    # (will fall through to text-based inference)
}

# Text-based geo heuristics (pattern → (city, state)).
# Applied when handle-based lookup fails or returns empty.
# Order matters: get_geo() returns the FIRST matching pattern, so more
# specific entries must precede generic ones ('Cleveland Heights' before
# 'Cleveland'; state-qualified 'Springfield ... IL' before any generic use).
# NOTE(review): ambiguous city names resolve to one fixed state here
# (Portland→OR, Columbus→OH only with 'OH') — acceptable for this dataset,
# but may misfire on e.g. Portland, ME.
TEXT_GEO_PATTERNS = [
    (r'\b(San Francisco|SF)\b', 'San Francisco', 'CA'),
    (r'\bSan Jose\b', 'San Jose', 'CA'),
    (r'\bLos Angeles\b', 'Los Angeles', 'CA'),
    (r'\bOakland\b', 'Oakland', 'CA'),
    (r'\bSacramento\b', 'Sacramento', 'CA'),
    (r'\bLompoc\b', 'Lompoc', 'CA'),
    (r'\bVentura\b', 'Ventura', 'CA'),
    (r'\bChicago\b', 'Chicago', 'IL'),
    (r'\bSpringfield\b.*\bIL\b', 'Springfield', 'IL'),
    (r'\bHouston\b', 'Houston', 'TX'),
    (r'\bDallas\b', 'Dallas', 'TX'),
    (r'\bAustin\b', 'Austin', 'TX'),
    (r'\bSan Antonio\b', 'San Antonio', 'TX'),
    (r'\bAlbuquerque\b', 'Albuquerque', 'NM'),
    (r'\bPhiladelphia\b', 'Philadelphia', 'PA'),
    (r'\bPittsburgh\b', 'Pittsburgh', 'PA'),
    # FIX: the NYC alternative previously lacked a leading \b, so any word
    # merely *ending* in "NYC" (e.g. "OCCUPANYC") would match.
    (r'\bNew York\b|\bNYC\b', 'New York', 'NY'),
    (r'\bRochester\b', 'Rochester', 'NY'),
    (r'\bSyracuse\b', 'Syracuse', 'NY'),
    (r'\bWashington.{0,5}D\.?C\.?', 'Washington', 'DC'),
    (r'\bPortland\b', 'Portland', 'OR'),
    (r'\bSeattle\b', 'Seattle', 'WA'),
    (r'\bBoston\b', 'Boston', 'MA'),
    (r'\bSalem\b.*\bMA\b', 'Salem', 'MA'),
    (r'\bSpringfield\b.*\bMA\b', 'Springfield', 'MA'),
    (r'\bProvidence\b', 'Providence', 'RI'),
    (r'\bNew Haven\b', 'New Haven', 'CT'),
    (r'\bHartford\b', 'Hartford', 'CT'),
    (r'\bNorwich\b', 'Norwich', 'CT'),
    (r'\bCleveland Heights\b', 'Cleveland Heights', 'OH'),
    (r'\bCleveland\b', 'Cleveland', 'OH'),
    (r'\bCincinnati\b', 'Cincinnati', 'OH'),
    (r'\bColumbus\b.*\bOH\b', 'Columbus', 'OH'),
    (r'\bBaltimore\b', 'Baltimore', 'MD'),
    (r'\bTrenton\b', 'Trenton', 'NJ'),
    (r'\bNewark\b', 'Newark', 'NJ'),
    (r'\bHammond\b.*\bIN\b', 'Hammond', 'IN'),
    (r'\bIndianapolis\b', 'Indianapolis', 'IN'),
    (r'\bBirmingham\b', 'Birmingham', 'AL'),
    (r'\bMadison\b', 'Madison', 'WI'),
    (r'\bLansing\b', 'Lansing', 'MI'),
    (r'\bTopeka\b', 'Topeka', 'KS'),
    (r'\bSalt Lake\b', 'Salt Lake City', 'UT'),
    (r'\bConcord\b.*\bNH\b', 'Concord', 'NH'),
    (r'\bTallahassee\b', 'Tallahassee', 'FL'),
    (r'\bBaton Rouge\b', 'Baton Rouge', 'LA'),
    (r'\bFrankfort\b', 'Frankfort', 'KY'),
    (r'\bKodiak\b', 'Kodiak', 'AK'),
    (r'\bSt\. Thomas\b', 'St. Thomas', 'USVI'),
]


def get_geo(handle, text):
    """Return a (city, state) pair for a post.

    A handle-based lookup in GEO_MAP is tried first; an unknown handle
    (or one mapped to blanks) falls back to scanning the post text with
    TEXT_GEO_PATTERNS.  Returns ('', '') when nothing matches.
    """
    mapped = GEO_MAP.get(handle.lower())
    if mapped and (mapped[0] or mapped[1]):
        return mapped

    # Fall back to text-based inference on a newline-flattened copy.
    haystack = (text or '').replace('\n', ' ')
    return next(
        ((city, state)
         for pattern, city, state in TEXT_GEO_PATTERNS
         if re.search(pattern, haystack, re.IGNORECASE)),
        ('', ''),
    )


# ── Repost detection ─────────────────────────────────────────────────────────

def normalize_text(text):
    """Lowercase *text* with every whitespace run collapsed to one space.

    Falsy input (None, '') comes back as '' so callers can compare
    normalized values without None checks.
    """
    return re.sub(r'\s+', ' ', text.strip()).lower() if text else ''


def similarity(a, b):
    """Return the difflib ratio (0.0-1.0) between two strings.

    Either argument being empty/falsy short-circuits to 0.0.
    """
    if a and b:
        return difflib.SequenceMatcher(None, a, b).ratio()
    return 0.0


# Explicit repost markers at the very start of a post ("Shared from ...",
# "RT @...", "via @...").  Used via .match() on stripped text, so the ^
# anchor is belt-and-braces — it only ever fires at position 0.
# Note 'reposted' is listed before its prefix 'repost' (alternation order).
REPOST_PREFIXES = re.compile(
    r'^(shared from|reposted|rt @|repost|via @)',
    re.IGNORECASE
)

# Instagram/Facebook auto-generated image description stubs — not real content
STUB_PATTERN = re.compile(
    r"^[\w']+['']s (profile picture|highlight story picture)$"
    r"|^photo (by|shared by) .{0,60} on .{0,30}\. may be"
    r"|^photo by .{0,60} in ",
    re.IGNORECASE
)


def check_prefix_repost(text):
    """Return True when the post opens with an explicit repost marker.

    Markers are defined by the module-level REPOST_PREFIXES regex
    ("shared from", "rt @", "via @", ...).  None-safe.
    """
    candidate = (text or '').strip()
    return REPOST_PREFIXES.match(candidate) is not None


def load_meltwater(path):
    """Load Meltwater posts as a list of (handle, normalized_text) tuples.

    Items with empty text are dropped.  Any failure (missing file, bad
    JSON, malformed items) prints a warning and yields an empty list so
    the caller can proceed without cross-platform matching.
    """
    posts = []
    try:
        with open(path, 'r', encoding='utf-8') as f:
            for item in json.load(f):
                norm = normalize_text(item.get('text', ''))
                if norm:
                    posts.append((item.get('handle', ''), norm))
    except Exception as e:
        # Deliberate best-effort: cross-platform detection is optional.
        print(f"Warning: could not load Meltwater data: {e}")
        return []
    return posts


def _apply_geo_and_prefix_flags(rows):
    """First pass: fill city/state and reset/flag repost fields in place."""
    for row in rows:
        handle = row['handle'].strip().lower()
        text = row.get('text', '') or ''

        # Geo columns are always recalculated so reruns stay idempotent.
        row['city'], row['state'] = get_geo(handle, text)

        # Clear repost fields before recalculating them fresh.
        row['is_repost'] = ''
        row['repost_of'] = ''

        # Explicit markers like "RT @...", "via @...", "Shared from ...".
        if check_prefix_repost(text):
            row['is_repost'] = 'yes'


def _collect_comparable(rows):
    """Return (row_idx, norm_text, handle) for rows worth duplicate-checking.

    Skips empty texts, auto-generated image-description stubs, and texts
    of 80 characters or fewer, which would give spurious similarity hits.
    """
    comparable = []
    for i, row in enumerate(rows):
        norm = normalize_text(row.get('text', ''))
        if norm and not STUB_PATTERN.match(norm) and len(norm) > 80:
            comparable.append((i, norm, row['handle'].lower()))
    return comparable


def _mark_intra_csv_reposts(rows, comparable):
    """Second pass: flag rows that near-duplicate (>=90%) an earlier row."""
    for i, (idx_i, norm_i, handle_i) in enumerate(comparable):
        row_i = rows[idx_i]
        if row_i.get('is_repost') == 'yes':
            continue  # already flagged by an explicit prefix marker

        for j in range(i):
            idx_j, norm_j, handle_j = comparable[j]
            if handle_i == handle_j:
                continue  # an account repeating itself is not a repost
            # Exact equality is a cheap fast path before the slow ratio.
            if norm_i == norm_j or similarity(norm_i, norm_j) >= 0.90:
                row_i['is_repost'] = 'yes'
                row_i['repost_of'] = handle_j
                break


def _mark_cross_platform_reposts(rows, meltwater_posts):
    """Third pass: flag unmarked rows whose text matches a Meltwater post.

    BUGFIX: the previous guard compared is_repost to 'cross-platform',
    which could never be true because pass 1 resets the field, so rows
    already marked 'yes' still ran a full similarity scan only for the
    inner "don't override" check to discard the result.  Skipping any
    already-classified row up front yields the identical final state
    without the wasted O(len(meltwater)) work per row.
    """
    for row in rows:
        if row.get('is_repost'):
            continue  # intra-CSV or prefix match takes precedence
        norm = normalize_text(row.get('text', ''))
        # NOTE(review): this uses < 80 where _collect_comparable uses > 80,
        # so len == 80 is treated inconsistently — kept as in the original.
        if not norm or len(norm) < 80 or STUB_PATTERN.match(norm):
            continue
        for mw_handle, mw_norm in meltwater_posts:
            if similarity(norm, mw_norm) >= 0.90:
                row['is_repost'] = 'cross-platform'
                row['repost_of'] = mw_handle
                break


def _print_summary(rows):
    """Print a tally of geo-tagged rows, repost counts, and distinct places."""
    total = len(rows)
    geo_tagged = sum(1 for r in rows if r.get('city') or r.get('state'))
    reposts = sum(1 for r in rows if r.get('is_repost') == 'yes')
    cross_platform = sum(1 for r in rows if r.get('is_repost') == 'cross-platform')
    cities = {r['city'] for r in rows if r.get('city')}
    states = {r['state'] for r in rows if r.get('state')}

    print(f"\n{'='*50}")
    print(f"  SUMMARY")
    print(f"{'='*50}")
    print(f"  Total posts:             {total}")
    print(f"  Posts with geo-tags:     {geo_tagged}")
    print(f"  Intra-CSV reposts:       {reposts}")
    print(f"  Cross-platform reposts:  {cross_platform}")
    print(f"  Unique cities:           {len(cities)}")
    print(f"  Unique states:           {len(states)}")
    print(f"\n  Cities: {', '.join(sorted(cities))}")
    print(f"\n  States: {', '.join(sorted(states))}")
    print(f"{'='*50}\n")


def main():
    """Fill city/state/is_repost/repost_of in the master CSV and rewrite it."""
    meltwater_posts = load_meltwater(MELTWATER_PATH)

    with open(CSV_PATH, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        fieldnames = reader.fieldnames
        rows = list(reader)

    # (A text_index dict was previously built here and never read by any
    # pass — dead code, removed.)
    _apply_geo_and_prefix_flags(rows)
    _mark_intra_csv_reposts(rows, _collect_comparable(rows))
    _mark_cross_platform_reposts(rows, meltwater_posts)

    # Write the updated rows back over the original file.
    with open(CSV_PATH, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    _print_summary(rows)


if __name__ == '__main__':
    main()
