# yrtv/ETL/verify/verify_L2.py

import sqlite3
import pandas as pd
import csv
import os
import sys
import time

# Show every column and use a wide console so printed DataFrames are not truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# SQLite database inspected by verify(), watch_schema() and the CLI "schema" modes.
db_path = 'database/L2/L2_Main.sqlite'
# CSV of flattened JSON paths; load_schema_paths() reads the second column of each row.
schema_path = 'database/original_json_schema/schema_flat.csv'

# First key after "data.main." that is_covered() treats as covered.
covered_main_fields = {
    "match_code", "map", "start_time", "end_time", "match_winner",
    "group1_all_score", "group1_change_elo", "group1_fh_role", "group1_fh_score",
    "group1_origin_elo", "group1_sh_role", "group1_sh_score", "group1_tid", "group1_uids",
    "group2_all_score", "group2_change_elo", "group2_fh_role", "group2_fh_score",
    "group2_origin_elo", "group2_sh_role", "group2_sh_score", "group2_tid", "group2_uids",
    "server_ip", "server_port", "location", "location_full", "map_desc",
    "demo_url", "game_mode", "game_name", "match_mode", "match_status", "match_flag",
    "status", "waiver", "year", "season", "round_total", "cs_type", "priority_show_type",
    "pug10m_show_type", "credit_match_status", "knife_winner", "knife_winner_role",
    "most_1v2_uid", "most_assist_uid", "most_awp_uid", "most_end_uid",
    "most_first_kill_uid", "most_headshot_uid", "most_jump_uid", "mvp_uid", "id"
}
# Substrings; any schema path containing one of them counts as covered in is_covered().
covered_user_fields = {
    "data.group_N[].user_info."
}
# Substrings matched against schema paths by is_covered().
# NOTE(review): is_covered() returns True for any path containing "data.round_list"
# before it consults this list, so these entries currently never decide the result.
covered_round_fields = [
    "data.round_list[].current_score.ct",
    "data.round_list[].current_score.t",
    "data.round_list[].current_score.final_round_time",
    "data.round_list[].all_kill[].pasttime",
    "data.round_list[].all_kill[].weapon",
    "data.round_list[].all_kill[].headshot",
    "data.round_list[].all_kill[].penetrated",
    "data.round_list[].all_kill[].attackerblind",
    "data.round_list[].all_kill[].throughsmoke",
    "data.round_list[].all_kill[].noscope",
    "data.round_list[].all_kill[].attacker.steamid_64",
    "data.round_list[].all_kill[].victim.steamid_64",
    "data.round_list[].all_kill[].attacker.pos.x",
    "data.round_list[].all_kill[].attacker.pos.y",
    "data.round_list[].all_kill[].attacker.pos.z",
    "data.round_list[].all_kill[].victim.pos.x",
    "data.round_list[].all_kill[].victim.pos.y",
    "data.round_list[].all_kill[].victim.pos.z"
]
# Substrings matched against schema paths by is_covered(); entries ending in "." act
# as prefixes for a whole sub-object.
# NOTE(review): is_covered() returns True for any path containing "data.leetify_data."
# before it consults this list, so these entries currently never decide the result.
covered_leetify_fields = [
    "data.leetify_data.round_stat[].round",
    "data.leetify_data.round_stat[].win_reason",
    "data.leetify_data.round_stat[].end_ts",
    "data.leetify_data.round_stat[].sfui_event.score_ct",
    "data.leetify_data.round_stat[].sfui_event.score_t",
    "data.leetify_data.round_stat[].ct_money_group",
    "data.leetify_data.round_stat[].t_money_group",
    "data.leetify_data.round_stat[].show_event[].ts",
    "data.leetify_data.round_stat[].show_event[].kill_event.Ts",
    "data.leetify_data.round_stat[].show_event[].kill_event.Killer",
    "data.leetify_data.round_stat[].show_event[].kill_event.Victim",
    "data.leetify_data.round_stat[].show_event[].kill_event.WeaponName",
    "data.leetify_data.round_stat[].show_event[].kill_event.Headshot",
    "data.leetify_data.round_stat[].show_event[].kill_event.Penetrated",
    "data.leetify_data.round_stat[].show_event[].kill_event.AttackerBlind",
    "data.leetify_data.round_stat[].show_event[].kill_event.ThroughSmoke",
    "data.leetify_data.round_stat[].show_event[].kill_event.NoScope",
    "data.leetify_data.round_stat[].show_event[].trade_score_change.",
    "data.leetify_data.round_stat[].show_event[].flash_assist_killer_score_change.",
    "data.leetify_data.round_stat[].show_event[].killer_score_change.",
    "data.leetify_data.round_stat[].show_event[].victim_score_change.",
    "data.leetify_data.round_stat[].bron_equipment.",
    "data.leetify_data.round_stat[].player_t_score.",
    "data.leetify_data.round_stat[].player_ct_score.",
    "data.leetify_data.round_stat[].player_bron_crash."
]
# First key after "data.<steamid>." that is_covered() treats as covered.
covered_vip_fields = {
    "awp_kill",
    "awp_kill_ct",
    "awp_kill_t",
    "damage_receive",
    "damage_stats",
    "fd_ct",
    "fd_t",
    "kast"
}

def load_schema_paths(schema_path_value):
    """Return the second column of every data row in the schema CSV.

    Args:
        schema_path_value: Path of a UTF-8 CSV file whose first row is a header.

    Returns:
        A list with the value of column index 1 for each row that has at
        least two columns; the header row and shorter rows are skipped.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' is required by the csv module: without it, universal-newline
    # translation rewrites line breaks that appear inside quoted fields.
    with open(schema_path_value, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # discard the header row (no-op on an empty file)
        return [row[1] for row in reader if len(row) >= 2]

def is_covered(path):
    """Tell whether a flattened schema path counts as covered.

    A path is covered when it is one of the top-level response keys, when its
    first key after "data.<steamid>." or "data.main." is in the matching
    covered_* set, or when it contains one of the blanket substrings below or
    an entry of the covered_user/round/leetify collections.

    Args:
        path: Flattened schema path string.

    Returns:
        True if the path is covered, otherwise False.
    """
    top_level_keys = (
        "data", "code", "message", "status", "timestamp",
        "timeStamp", "traceId", "success", "errcode",
    )
    if path in top_level_keys:
        return True

    vip_prefix = "data.<steamid>."
    if path.startswith(vip_prefix):
        vip_key = path.split(vip_prefix)[1].split(".")[0]
        if vip_key in covered_vip_fields:
            return True

    # Sub-trees that are accepted wholesale, whatever follows the marker.
    blanket_markers = (
        "data.group_N[].fight_any.",
        "data.group_N[].fight_t.",
        "data.group_N[].fight_ct.",
        "data.group_N[].sts.",
        "data.group_N[].level_info.",
        "data.treat_info.",
        "data.has_side_data_and_rating2",
    )
    for marker in blanket_markers:
        if marker in path:
            return True

    main_marker = "data.main."
    if main_marker in path:
        main_key = path.split(main_marker)[1].split(".")[0]
        if main_key in covered_main_fields:
            return True

    return (
        any(fragment in path for fragment in covered_user_fields)
        or "data.round_list" in path
        or any(fragment in path for fragment in covered_round_fields)
        or "data.leetify_data." in path
        or any(fragment in path for fragment in covered_leetify_fields)
    )

def group_key(p):
    """Map a schema path to the label of the first group whose marker it contains.

    Markers are tried in order, so more specific ones must precede the generic
    "data." catch-all.

    Args:
        p: Flattened schema path string.

    Returns:
        The group label, or "other" when no marker occurs in the path.
    """
    ordered_rules = (
        ("data.group_N[].user_info.", "data.group_N[].user_info.*"),
        ("data.group_N[].fight_any.", "data.group_N[].fight_any.*"),
        ("data.group_N[].fight_t.", "data.group_N[].fight_t.*"),
        ("data.group_N[].fight_ct.", "data.group_N[].fight_ct.*"),
        ("data.main.", "data.main.*"),
        ("data.round_list[]", "data.round_list.*"),
        ("data.leetify_data.round_stat[]", "data.leetify_data.round_stat.*"),
        ("data.leetify_data.", "data.leetify_data.*"),
        ("data.treat_info.", "data.treat_info.*"),
        ("data.", "data.*"),
    )
    return next((label for marker, label in ordered_rules if marker in p), "other")

def dump_uncovered(output_path):
    """Write every uncovered schema path, with its group, to a CSV and print a summary.

    Nothing is written when all paths are covered; a short notice is printed
    instead.

    Args:
        output_path: Destination CSV path (written as UTF-8 with BOM).
    """
    missing = [candidate for candidate in load_schema_paths(schema_path) if not is_covered(candidate)]
    if not missing:
        print("no uncovered paths")
        return

    report = pd.DataFrame({"path": missing})
    report["group"] = report["path"].apply(group_key)
    report = report.sort_values(["group", "path"])
    report.to_csv(output_path, index=False, encoding='utf-8-sig')

    per_group = report.groupby("group").size().sort_values(ascending=False)
    print(f"uncovered total: {len(report)}")
    print("\n-- uncovered groups (count) --")
    print(per_group)
    print(f"\noutput: {output_path}")

def print_schema(conn):
    """Print each user table of the database as an aligned column/type/pk listing.

    Tables are read from sqlite_master in name order, skipping names that
    match the LIKE pattern 'sqlite_%'.

    Args:
        conn: Open sqlite3 connection.
    """
    tables = conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%' ORDER BY name"
    ).fetchall()
    for (name,) in tables:
        print(f"\n[{name}]")
        # Quote the identifier (doubling embedded quotes) so table names with
        # spaces, punctuation or reserved words do not break the PRAGMA.
        quoted = '"' + name.replace('"', '""') + '"'
        cols = conn.execute(f"PRAGMA table_info({quoted})").fetchall()
        rows = [["column", "type", "pk"]]
        # table_info rows are (cid, name, type, notnull, dflt_value, pk).
        rows.extend([col_name, col_type or "", str(pk)] for _, col_name, col_type, _, _, pk in cols)
        widths = [max(len(r[i]) for r in rows) for i in range(3)]
        for idx, r in enumerate(rows):
            line = " | ".join(cell.ljust(width) for cell, width in zip(r, widths))
            print(line)
            if idx == 0:
                # Underline the header row.
                print("-" * len(line))

def refresh_schema_sql(conn, output_path):
    """Dump the CREATE statements of all tables and indexes to a SQL file.

    The file starts with a foreign-keys PRAGMA, followed by table statements
    and then index statements (each group ordered by name), separated by blank
    lines. Entries without stored SQL or matching 'sqlite_%' are skipped.

    Args:
        conn: Open sqlite3 connection to read sqlite_master from.
        output_path: File to overwrite with the generated SQL (UTF-8).
    """
    query = """
        SELECT type, name, sql
        FROM sqlite_master
        WHERE sql IS NOT NULL AND type IN ('table', 'index') AND name NOT LIKE 'sqlite_%'
        ORDER BY CASE WHEN type='table' THEN 0 ELSE 1 END, name
    """
    statements = ["PRAGMA foreign_keys = ON;"]
    for record in conn.execute(query).fetchall():
        ddl = record[2]
        statements.append(ddl.strip() + ";")
    # One blank line between statements and exactly one trailing newline.
    content = "\n\n".join(statements) + "\n"
    with open(output_path, 'w', encoding='utf-8') as out:
        out.write(content)

def _print_table_counts(conn):
    """Print the row count of each hard-coded L2 table."""
    print("--- Counts ---")
    tables = [
        'dim_players',
        'dim_maps',
        'fact_matches',
        'fact_match_teams',
        'fact_match_players',
        'fact_match_players_t',
        'fact_match_players_ct',
        'fact_rounds',
        'fact_round_events',
        'fact_round_player_economy'
    ]
    for t in tables:
        # Names come from the literal list above, so interpolating them is safe.
        count = conn.execute(f"SELECT COUNT(*) FROM {t}").fetchone()[0]
        print(f"{t}: {count}")


def _print_sample_kill_event(conn, source_type, label):
    """Print one kill event from the first match of the given data source, if any."""
    match_row = conn.execute(
        "SELECT match_id FROM fact_matches WHERE data_source_type=? LIMIT 1",
        (source_type,)
    ).fetchone()
    if not match_row:
        return
    mid = match_row[0]
    print(f"{label} Match: {mid}")
    # Bind the match id as a parameter instead of formatting it into the SQL text.
    df = pd.read_sql(
        "SELECT * FROM fact_round_events WHERE match_id=? AND event_type='kill' LIMIT 1",
        conn,
        params=(mid,)
    )
    print(df[['event_type', 'attacker_steam_id', 'trade_killer_steam_id', 'attacker_pos_x', 'score_change_attacker']])


def _print_samples(conn):
    """Print the data-source distribution and sample event/player rows."""
    print("\n--- Data Source Distribution ---")
    dist = pd.read_sql("SELECT data_source_type, COUNT(*) as cnt FROM fact_matches GROUP BY data_source_type", conn)
    print(dist)

    print("\n--- Sample Round Events (Leetify vs Classic) ---")
    _print_sample_kill_event(conn, 'leetify', 'Leetify')
    _print_sample_kill_event(conn, 'classic', 'Classic')

    print("\n--- Sample Player Stats (New Fields) ---")
    df_players = pd.read_sql("SELECT steam_id_64, rating, rating3, elo_change, rank_score, flash_duration, jump_count FROM fact_match_players LIMIT 5", conn)
    print(df_players)


def _print_insert_field_checks(conn):
    """Print how many fact_matches rows have each response/raw field populated."""
    print("\n--- Insert Field Checks ---")
    meta_counts = conn.execute("""
        SELECT
            SUM(CASE WHEN response_code IS NOT NULL THEN 1 ELSE 0 END) AS response_code_cnt,
            SUM(CASE WHEN response_trace_id IS NOT NULL AND response_trace_id != '' THEN 1 ELSE 0 END) AS response_trace_id_cnt,
            SUM(CASE WHEN response_success IS NOT NULL THEN 1 ELSE 0 END) AS response_success_cnt,
            SUM(CASE WHEN response_errcode IS NOT NULL THEN 1 ELSE 0 END) AS response_errcode_cnt,
            SUM(CASE WHEN treat_info_raw IS NOT NULL AND treat_info_raw != '' THEN 1 ELSE 0 END) AS treat_info_raw_cnt,
            SUM(CASE WHEN round_list_raw IS NOT NULL AND round_list_raw != '' THEN 1 ELSE 0 END) AS round_list_raw_cnt,
            SUM(CASE WHEN leetify_data_raw IS NOT NULL AND leetify_data_raw != '' THEN 1 ELSE 0 END) AS leetify_data_raw_cnt
        FROM fact_matches
    """).fetchone()
    print(f"response_code non-null: {meta_counts[0]}")
    print(f"response_trace_id non-empty: {meta_counts[1]}")
    print(f"response_success non-null: {meta_counts[2]}")
    print(f"response_errcode non-null: {meta_counts[3]}")
    print(f"treat_info_raw non-empty: {meta_counts[4]}")
    print(f"round_list_raw non-empty: {meta_counts[5]}")
    print(f"leetify_data_raw non-empty: {meta_counts[6]}")


def _print_integrity_checks(conn):
    """Print counts of orphaned rows and of side rows with all-zero K/D/A."""
    print("\n--- Integrity Checks ---")
    missing_players = conn.execute("""
        SELECT COUNT(*) FROM fact_match_players f
        LEFT JOIN dim_players d ON f.steam_id_64 = d.steam_id_64
        WHERE d.steam_id_64 IS NULL
    """).fetchone()[0]
    print(f"fact_match_players missing dim_players: {missing_players}")

    missing_round_matches = conn.execute("""
        SELECT COUNT(*) FROM fact_rounds r
        LEFT JOIN fact_matches m ON r.match_id = m.match_id
        WHERE m.match_id IS NULL
    """).fetchone()[0]
    print(f"fact_rounds missing fact_matches: {missing_round_matches}")

    missing_event_rounds = conn.execute("""
        SELECT COUNT(*) FROM fact_round_events e
        LEFT JOIN fact_rounds r ON e.match_id = r.match_id AND e.round_num = r.round_num
        WHERE r.match_id IS NULL
    """).fetchone()[0]
    print(f"fact_round_events missing fact_rounds: {missing_event_rounds}")

    side_zero_t = conn.execute("""
        SELECT COUNT(*) FROM fact_match_players_t
        WHERE COALESCE(kills,0)=0 AND COALESCE(deaths,0)=0 AND COALESCE(assists,0)=0
    """).fetchone()[0]
    side_zero_ct = conn.execute("""
        SELECT COUNT(*) FROM fact_match_players_ct
        WHERE COALESCE(kills,0)=0 AND COALESCE(deaths,0)=0 AND COALESCE(assists,0)=0
    """).fetchone()[0]
    print(f"fact_match_players_t zero K/D/A: {side_zero_t}")
    print(f"fact_match_players_ct zero K/D/A: {side_zero_ct}")


def _print_side_comparison(conn):
    """Compare full-match player stats with their T-side and CT-side rows."""
    print("\n--- Full vs T/CT Comparison ---")
    cols = [
        'kills', 'deaths', 'assists', 'headshot_count', 'adr', 'rating', 'rating2',
        'rating3', 'rws', 'mvp_count', 'flash_duration', 'jump_count', 'is_win'
    ]
    col_sql = ",".join(cols)
    df_full = pd.read_sql(
        "SELECT match_id, steam_id_64, " + col_sql + " FROM fact_match_players",
        conn
    )
    df_t = pd.read_sql(
        "SELECT match_id, steam_id_64, " + col_sql + " FROM fact_match_players_t",
        conn
    ).rename(columns={c: f"{c}_t" for c in cols})
    df_ct = pd.read_sql(
        "SELECT match_id, steam_id_64, " + col_sql + " FROM fact_match_players_ct",
        conn
    ).rename(columns={c: f"{c}_ct" for c in cols})

    # Left joins keep every full-match row even when a side row is missing.
    df = df_full.merge(df_t, on=['match_id', 'steam_id_64'], how='left')
    df = df.merge(df_ct, on=['match_id', 'steam_id_64'], how='left')

    def is_empty(s):
        # "Empty" means NULL or exactly zero.
        return s.isna() | (s == 0)

    def is_nonzero(s):
        return s.notna() & (s != 0)

    for c in cols:
        empty_count = is_empty(df[c]).sum()
        print(f"{c} empty: {empty_count}")

    additive = ['kills', 'deaths', 'assists', 'headshot_count', 'mvp_count', 'flash_duration', 'jump_count']
    for c in additive:
        t_sum = df[f"{c}_t"].fillna(0) + df[f"{c}_ct"].fillna(0)
        # flash_duration gets a small tolerance; every other column must match exactly.
        tol = 0.01 if c == 'flash_duration' else 0
        diff = (df[c].fillna(0) - t_sum).abs() > tol
        print(f"{c} full != t+ct: {diff.sum()}")

    non_additive = ['adr', 'rating', 'rating2', 'rating3', 'rws', 'is_win']
    for c in non_additive:
        side_nonempty = (~is_empty(df[f"{c}_t"])) | (~is_empty(df[f"{c}_ct"]))
        full_empty_side_nonempty = is_empty(df[c]) & side_nonempty
        full_nonempty_side_empty = (~is_empty(df[c])) & (~side_nonempty)
        print(f"{c} full empty but side has: {full_empty_side_nonempty.sum()}")
        print(f"{c} full has but side empty: {full_nonempty_side_empty.sum()}")

    print("\n--- Rating Detail ---")
    for c in ['rating', 'rating2', 'rating3']:
        full_null = df[c].isna().sum()
        full_zero = (df[c] == 0).sum()
        full_nonzero = is_nonzero(df[c]).sum()
        side_t_nonzero = is_nonzero(df[f"{c}_t"]).sum()
        side_ct_nonzero = is_nonzero(df[f"{c}_ct"]).sum()
        side_any_nonzero = is_nonzero(df[f"{c}_t"]) | is_nonzero(df[f"{c}_ct"])
        full_nonzero_side_zero = (is_nonzero(df[c]) & (~side_any_nonzero)).sum()
        full_zero_side_nonzero = (is_empty(df[c]) & side_any_nonzero).sum()
        print(f"{c} full null: {full_null} full zero: {full_zero} full nonzero: {full_nonzero}")
        print(f"{c} side t nonzero: {side_t_nonzero} side ct nonzero: {side_ct_nonzero}")
        print(f"{c} full nonzero but side all zero: {full_nonzero_side_zero}")
        print(f"{c} full zero but side has: {full_zero_side_nonzero}")


def _print_rating_by_source(conn):
    """Print, per data source, how many player rows have a non-null, nonzero rating."""
    df_rating_src = pd.read_sql(
        "SELECT f.rating, f.rating2, f.rating3, m.data_source_type FROM fact_match_players f JOIN fact_matches m ON f.match_id = m.match_id",
        conn
    )
    for c in ['rating', 'rating2', 'rating3']:
        # NaN != 0 evaluates to True, so NULLs must be excluded explicitly or
        # they would be reported as nonzero ratings.
        grp = df_rating_src.groupby('data_source_type')[c].apply(
            lambda s: (s.notna() & (s != 0)).sum()
        ).reset_index(name='nonzero')
        print(f"{c} nonzero by source")
        print(grp)


def _print_fight_any_coverage(conn, paths):
    """Report which fight_any schema keys map to fact_match_players columns and their zero rates."""
    print("\n--- Schema Coverage (fight_any) ---")
    fight_keys = set()
    for p in paths:
        if 'data.group_N[].fight_any.' in p:
            key = p.split('fight_any.')[1].split('.')[0]
            fight_keys.add(key)
    l2_cols = set(pd.read_sql("PRAGMA table_info(fact_match_players)", conn)['name'].tolist())
    # L2 column name -> fight_any key, for columns that were renamed.
    alias = {
        'kills': 'kill',
        'deaths': 'death',
        'assists': 'assist',
        'headshot_count': 'headshot',
        'mvp_count': 'is_mvp',
        'flash_duration': 'flash_enemy_time',
        'jump_count': 'jump_total',
        'awp_kills': 'awp_kill'
    }
    covered = set()
    for c in l2_cols:
        if c in fight_keys:
            covered.add(c)
        elif c in alias and alias[c] in fight_keys:
            covered.add(alias[c])
    missing_keys = sorted(fight_keys - covered)
    print(f"fight_any keys: {len(fight_keys)}")
    print(f"covered by L2 columns: {len(covered)}")
    print(f"uncovered fight_any keys: {len(missing_keys)}")
    if missing_keys:
        print(missing_keys)

    print("\n--- Coverage Zero Rate (fight_any -> fact_match_players) ---")
    reverse_alias = {src: l2k for l2k, src in alias.items()}
    col_map = {}
    for k in fight_keys:
        if k in l2_cols:
            col_map[k] = k
        elif reverse_alias.get(k) in l2_cols:
            # Only follow an alias when the aliased column really exists,
            # otherwise the SELECT below would fail on an unknown column.
            col_map[k] = reverse_alias[k]
    select_cols = ["steam_id_64"] + sorted(set(col_map.values()))
    df_fight = pd.read_sql(
        "SELECT " + ",".join(select_cols) + " FROM fact_match_players",
        conn
    )
    total_rows = len(df_fight)
    stats = []
    for fight_key, col in sorted(col_map.items()):
        s = df_fight[col]
        zeros = (s == 0).sum()
        nulls = s.isna().sum()
        nonzero = total_rows - zeros - nulls
        stats.append({
            "fight_key": fight_key,
            "column": col,
            "nonzero": nonzero,
            "zero": zeros,
            "null": nulls,
            "zero_rate": 0 if total_rows == 0 else round(zeros / total_rows, 4)
        })
    if not stats:
        # An empty frame has no columns to sort by; report and stop instead of raising.
        print("no fight_any keys map to fact_match_players columns")
        return
    df_stats = pd.DataFrame(stats).sort_values(["zero_rate", "nonzero"], ascending=[False, True])
    print(df_stats.head(30))
    print("\n-- zero_rate top (most zeros) --")
    print(df_stats.head(10))
    print("\n-- zero_rate bottom (most nonzero) --")
    print(df_stats.tail(10))


def _print_path_coverage(paths):
    """Print leetify economy path counts and the overall covered/uncovered summary."""
    print("\n--- Schema Coverage (leetify economy) ---")
    econ_keys = [
        'data.leetify_data.round_stat[].bron_equipment.',
        'data.leetify_data.round_stat[].player_t_score.',
        'data.leetify_data.round_stat[].player_ct_score.',
        'data.leetify_data.round_stat[].player_bron_crash.'
    ]
    for k in econ_keys:
        count = sum(1 for p in paths if k in p)
        print(f"{k} paths: {count}")

    print("\n--- Schema Summary Coverage (by path groups) ---")
    uncovered = [p for p in paths if not is_covered(p)]
    print(f"total paths: {len(paths)}")
    print(f"covered paths: {len(paths) - len(uncovered)}")
    print(f"uncovered paths: {len(uncovered)}")

    df_unc = pd.DataFrame({"path": uncovered})
    if len(df_unc) > 0:
        df_unc["group"] = df_unc["path"].apply(group_key)
        print("\n-- Uncovered groups (count) --")
        print(df_unc.groupby("group").size().sort_values(ascending=False))
        print("\n-- Uncovered examples (top 50) --")
        print(df_unc["path"].head(50).to_list())


def verify():
    """Run all L2 verification reports against the database and print the results.

    Opens the SQLite file at db_path, prints table counts, samples, field and
    integrity checks, full-vs-side comparisons and schema coverage (using the
    CSV at schema_path), and always closes the connection, even when one of
    the reports raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        _print_table_counts(conn)
        _print_samples(conn)
        _print_insert_field_checks(conn)
        _print_integrity_checks(conn)
        _print_side_comparison(conn)
        _print_rating_by_source(conn)
        paths = load_schema_paths(schema_path)
        _print_fight_any_coverage(conn, paths)
        _print_path_coverage(paths)
    finally:
        conn.close()

def watch_schema(schema_path, interval=1.0):
    """Poll the database and regenerate/print the schema whenever it changes.

    Loops forever. On the first pass, and whenever the modification time of
    the database file or of the output file increases, the schema SQL is
    rewritten to ``schema_path`` and the table layout is printed.

    Args:
        schema_path: Output file for the generated SQL. Note that this
            parameter shadows the module-level ``schema_path`` inside the
            function.
        interval: Seconds to sleep between polls.
    """
    def mtime_or_zero(path):
        return os.path.getmtime(path) if os.path.exists(path) else 0

    last_db_mtime = 0
    last_schema_mtime = 0
    first = True
    while True:
        if not os.path.exists(db_path):
            print(f"db not found: {db_path}")
            time.sleep(interval)
            continue
        db_mtime = os.path.getmtime(db_path)
        schema_mtime = mtime_or_zero(schema_path)
        if first or db_mtime > last_db_mtime or schema_mtime > last_schema_mtime:
            conn = sqlite3.connect(db_path)
            try:
                refresh_schema_sql(conn, schema_path)
                print(f"\n[{time.strftime('%Y-%m-%d %H:%M:%S')}] schema.sql refreshed")
                print_schema(conn)
            finally:
                # Release the connection even if the refresh or the print fails.
                conn.close()
            last_db_mtime = db_mtime
            # Re-read after writing so our own write does not trigger another refresh.
            last_schema_mtime = mtime_or_zero(schema_path)
            first = False
        time.sleep(interval)

if __name__ == "__main__":
    # Command-line dispatch on case-insensitive keywords:
    #   dump_uncovered | uncovered -> write the uncovered-paths CSV
    #   watch_schema | watch       -> keep schema.sql refreshed until Ctrl-C
    #   schema | refresh_schema    -> print the schema (and rewrite schema.sql
    #                                 when "refresh_schema" is given)
    #   anything else / no args    -> run the full verification report
    args = [a.lower() for a in sys.argv[1:]]
    if "dump_uncovered" in args or "uncovered" in args:
        dump_uncovered('database/original_json_schema/uncovered_features.csv')
    elif "watch_schema" in args or "watch" in args:
        try:
            watch_schema('database/L2/schema.sql')
        except KeyboardInterrupt:
            # Ctrl-C is the expected way to stop the watcher.
            pass
    elif "schema" in args or "refresh_schema" in args:
        if not os.path.exists(db_path):
            print(f"db not found: {db_path}")
        else:
            conn = sqlite3.connect(db_path)
            try:
                if "refresh_schema" in args:
                    refresh_schema_sql(conn, 'database/L2/schema.sql')
                    print("schema.sql refreshed")
                print_schema(conn)
            finally:
                # Close the connection even if the refresh or the print fails.
                conn.close()
    else:
        verify()