Files
yrtv/ETL/verify/verify_deep.py
2026-01-24 00:43:05 +08:00

82 lines
2.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import sqlite3
import pandas as pd
import numpy as np
# Console display settings for the printed reports: show every column, use a
# wide line so frames do not wrap, and render floats with two decimals.
pd.set_option(
    'display.max_columns', None,
    'display.width', 1000,
    'display.float_format', '{:.2f}'.format,
)

# Location of the L2 SQLite database; a relative path, so it is resolved
# against the current working directory of the process.
db_path = 'database/L2/L2_Main.sqlite'
def _percent(part, total):
    """Return ``part`` as a percentage of ``total``, or 0.0 when ``total`` is 0."""
    return (part / total) * 100 if total else 0.0


def _report_player_field_stats(conn):
    """Print per-column null and zero counts for ``fact_match_players``."""
    print("=== 1. Fact Match Players: 关键字段零值/空值检查 ===")
    df_players = pd.read_sql("""
        SELECT
            kills, deaths, assists, adr, rating, rating2,
            kast, awp_kills, flash_duration, jump_count,
            elo_change
        FROM fact_match_players
    """, conn)

    total = len(df_players)  # same for every column, so computed once
    stats = []
    for col in df_players.columns:
        nulls = int(df_players[col].isnull().sum())
        zeros = int((df_players[col] == 0).sum())
        stats.append({
            'Field': col,
            'Total': total,
            'Nulls': nulls,
            'Null%': _percent(nulls, total),
            'Zeros': zeros,
            'Zero%': _percent(zeros, total),
        })
    print(pd.DataFrame(stats))


def _report_kill_integrity(conn):
    """Print missing-attacker, missing-victim and self-kill counts for kill events."""
    print("\n=== 2. Fact Round Events (Kills): 击杀完整性检查 ===")
    # Only inspect records with event_type = 'kill'; only the two ID columns
    # are needed for these checks.
    df_kills = pd.read_sql("""
        SELECT
            attacker_steam_id, victim_steam_id
        FROM fact_round_events
        WHERE event_type = 'kill'
    """, conn)

    attacker = df_kills['attacker_steam_id']
    victim = df_kills['victim_steam_id']

    # An ID counts as missing when it is NULL or an empty string.
    attacker_missing = attacker.isnull() | (attacker == '')
    victim_missing = victim.isnull() | (victim == '')

    total_kills = len(df_kills)
    missing_attacker = int(attacker_missing.sum())
    missing_victim = int(victim_missing.sum())

    # Check whether attacker and victim are the same (suicide). Rows whose
    # attacker ID is missing are excluded: two empty strings compare equal but
    # do not identify a player, so they must not be reported as self kills.
    self_kills = int(((attacker == victim) & ~attacker_missing).sum())

    print(f"Total Kill Events: {total_kills}")
    print(f"Missing Attacker: {missing_attacker} ({_percent(missing_attacker, total_kills):.2f}%)")
    print(f"Missing Victim: {missing_victim} ({_percent(missing_victim, total_kills):.2f}%)")
    print(f"Self Kills (Suicide?): {self_kills}")


def _report_kill_coverage(conn):
    """Print, per ``data_source_type``, how many kill events carry a position and a score."""
    print("\n=== 3. Fact Round Events: 坐标与评分覆盖率 ===")
    # Coordinates are expected to be populated for classic matches and may be
    # empty for leetify matches.
    # Score changes are expected to be populated for leetify matches.
    df_events = pd.read_sql("""
        SELECT
            m.data_source_type,
            COUNT(*) as total_events,
            SUM(CASE WHEN e.attacker_pos_x IS NOT NULL AND e.attacker_pos_x != 0 THEN 1 ELSE 0 END) as has_pos,
            SUM(CASE WHEN e.score_change_attacker IS NOT NULL AND e.score_change_attacker != 0 THEN 1 ELSE 0 END) as has_score
        FROM fact_round_events e
        JOIN fact_matches m ON e.match_id = m.match_id
        WHERE e.event_type = 'kill'
        GROUP BY m.data_source_type
    """, conn)
    print(df_events)


def check_nulls_zeros(database_path=None):
    """Print a three-part data-quality report for the L2 SQLite database.

    The report covers:
      1. null / zero counts of key columns in ``fact_match_players``;
      2. missing attacker / victim IDs and self kills among kill events in
         ``fact_round_events``;
      3. per ``data_source_type`` coverage of attacker position and attacker
         score change for kill events.

    Args:
        database_path: Path of the SQLite file to inspect. When ``None``
            (the default) the module-level ``db_path`` is used, which keeps
            existing zero-argument callers working.

    Returns:
        None. All results are written to stdout.

    Raises:
        Whatever ``sqlite3.connect`` or ``pandas.read_sql`` raise, e.g. when a
        table or column is missing. The connection is closed in that case too.
    """
    conn = sqlite3.connect(db_path if database_path is None else database_path)
    try:
        _report_player_field_stats(conn)
        _report_kill_integrity(conn)
        _report_kill_coverage(conn)
    finally:
        # Close even when a query fails so the connection is never leaked.
        conn.close()
# Run the full report only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    check_nulls_zeros()