1.4.0: Updated Profile
This commit is contained in:
@@ -978,6 +978,163 @@ class FeatureService:
|
||||
# Prefer the backfilled utilization rate; treat missing values as 0,
# then drop the temporary backup column.
df['util_usage_rate'] = df['util_usage_rate_backup'].fillna(0)
df.drop(columns=['util_usage_rate_backup'], inplace=True)
||||
# --- 8. New Feature Dimensions (Party, Rating Dist, ELO) ---
# One base query feeds all three feature groups below: per-player,
# per-match rows with team membership and performance metrics.
q_new_feats = f"""
    SELECT mp.steam_id_64, mp.match_id, mp.match_team_id, mp.team_id,
           mp.rating, mp.adr, mp.is_win
    FROM fact_match_players mp
    WHERE mp.steam_id_64 IN ({placeholders})
"""
df_base = pd.read_sql_query(q_new_feats, conn, params=valid_ids)
||||
if not df_base.empty:
    # --- 8.1 Party Size Stats ---
    # Party size = number of players sharing a match_team_id in a match.
    match_ids = df_base['match_id'].unique()
    # SQLite caps bound parameters (default 999), so every IN-list query
    # over match_ids is issued in chunks.
    chunk_size = 900

    if len(match_ids) > 0:
        party_sizes_list = []
        for i in range(0, len(match_ids), chunk_size):
            chunk = list(match_ids[i:i + chunk_size])
            chunk_ph = ','.join(['?'] * len(chunk))
            # match_team_id > 0 — presumably non-positive ids mark
            # solo/unknown groupings; TODO confirm against the loader.
            q_chunk = f"""
                SELECT match_id, match_team_id, COUNT(*) as party_size
                FROM fact_match_players
                WHERE match_id IN ({chunk_ph}) AND match_team_id > 0
                GROUP BY match_id, match_team_id
            """
            party_sizes_list.append(pd.read_sql_query(q_chunk, conn, params=chunk))

        if party_sizes_list:
            df_party_sizes = pd.concat(party_sizes_list)

            # Attach each row's party size to the base data.
            df_base_party = df_base.merge(
                df_party_sizes, on=['match_id', 'match_team_id'], how='left')

            # Mean win rate / rating / ADR per (player, party size).
            party_stats = df_base_party.groupby(['steam_id_64', 'party_size']).agg({
                'is_win': 'mean',
                'rating': 'mean',
                'adr': 'mean',
            }).reset_index()

            # Wide format: one row per player, MultiIndex columns (metric, size).
            pivoted_party = party_stats.pivot(
                index='steam_id_64', columns='party_size').reset_index()

            # Flatten the MultiIndex into party_{size}_{metric} columns.
            # (A previous revision also built an unused column-name list here;
            # removed as dead code.)
            flat_data = {'steam_id_64': pivoted_party['steam_id_64']}
            for size in [1, 2, 3, 4, 5]:
                for src_metric, out_metric in (('is_win', 'win_rate'),
                                               ('rating', 'rating'),
                                               ('adr', 'adr')):
                    if size in pivoted_party[src_metric].columns:
                        flat_data[f"party_{size}_{out_metric}"] = pivoted_party[src_metric][size]

            df_party_flat = pd.DataFrame(flat_data)
            df = df.merge(df_party_flat, on='steam_id_64', how='left')

    # --- 8.2 Rating Distribution ---
    # Tier each match rating with half-open [a, b) bins (right=False):
    #   sleeping  < 0.6
    #   sacrifice [0.6, 1.0)
    #   normal    [1.0, 1.5)
    #   carry     >= 1.5
    df_base['rating_tier'] = pd.cut(
        df_base['rating'],
        bins=[-float('inf'), 0.6, 1.0, 1.5, float('inf')],
        labels=['sleeping', 'sacrifice', 'normal', 'carry'],
        right=False)

    # Share of a player's matches falling in each tier.
    # observed=False keeps a column for every tier even when a player has
    # no matches in it (explicit; matches the current pandas default).
    dist_stats = df_base.groupby(
        ['steam_id_64', 'rating_tier'], observed=False).size().unstack(fill_value=0)
    dist_stats = dist_stats.div(dist_stats.sum(axis=1), axis=0)
    dist_stats.columns = [f"rating_dist_{c}_rate" for c in dist_stats.columns]
    dist_stats = dist_stats.reset_index()
    df = df.merge(dist_stats, on='steam_id_64', how='left')

    # --- 8.3 ELO Stratification ---
    # Average rating achieved against opponents in each ELO band.
    if len(match_ids) > 0:
        elo_list = []
        for i in range(0, len(match_ids), chunk_size):
            chunk = list(match_ids[i:i + chunk_size])
            chunk_ph = ','.join(['?'] * len(chunk))
            q_chunk = f"""
                SELECT match_id, group_id, group_origin_elo
                FROM fact_match_teams
                WHERE match_id IN ({chunk_ph})
            """
            elo_list.append(pd.read_sql_query(q_chunk, conn, params=chunk))

        if elo_list:
            df_elo_teams = pd.concat(elo_list)

            # Join both teams of each match, then keep only the rows whose
            # group_id differs from the player's own team_id — i.e. the
            # opponent's group_origin_elo.
            df_merged_elo = df_base.merge(df_elo_teams, on='match_id', how='left')
            df_merged_elo = df_merged_elo[
                df_merged_elo['group_id'] != df_merged_elo['team_id']]

            # Opponent ELO bands, half-open [a, b):
            # <1200, 1200-1400, 1400-1600, 1600-1800, 1800-2000, >=2000.
            elo_bins = [-float('inf'), 1200, 1400, 1600, 1800, 2000, float('inf')]
            elo_labels = ['lt1200', '1200_1400', '1400_1600',
                          '1600_1800', '1800_2000', 'gt2000']
            df_merged_elo['elo_bin'] = pd.cut(
                df_merged_elo['group_origin_elo'],
                bins=elo_bins, labels=elo_labels, right=False)

            # Mean rating per (player, opponent ELO band); only rating is
            # needed for now.
            elo_stats = df_merged_elo.groupby(
                ['steam_id_64', 'elo_bin'], observed=False).agg({
                    'rating': 'mean'
                }).unstack(fill_value=0)

            # Flatten the (rating, bin) MultiIndex into elo_{bin}_rating.
            flat_elo_data = {'steam_id_64': elo_stats.index}
            for bin_label in elo_labels:
                if bin_label in elo_stats['rating'].columns:
                    flat_elo_data[f"elo_{bin_label}_rating"] = \
                        elo_stats['rating'][bin_label].values

            df_elo_flat = pd.DataFrame(flat_elo_data)
            df = df.merge(df_elo_flat, on='steam_id_64', how='left')

# Final mappings: expose matches_played under the name used downstream.
# NOTE(review): original indentation was lost in extraction; this is placed
# outside the df_base guard so the alias always exists — confirm intent.
df['total_matches'] = df['matches_played']
||||
Reference in New Issue
Block a user