# Source code for results.win_statistics

import math
import re
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from scipy.stats import chi2_contingency, chisquare

# How to run an example:
# python compare_games_statistics_FIXED.py batch_comparison/resource_game/resource_allocation_2013440.out

def parse_negotiation_log_corrected(file_path):
    """
    Parse a negotiation log file into one row per decided game.

    The log is split on ``=== Iteration N/M ===`` headers. For every
    iteration the role assignment, the first mover and the winner are
    extracted. Iterations without a parsable role assignment or winner are
    skipped with a warning, and tied games are dropped.

    Args:
        file_path (str): Path to the negotiation log file (read as UTF-8).

    Returns:
        tuple: A tuple containing:
            - pd.DataFrame: One row per game with the columns ``Iteration``,
              ``Winning_Role``, ``First_Mover_Role``, ``Winner_Model``,
              ``First_Mover_Model``, ``Game_type`` and ``Model_Assignments``.
              The frame is empty (no columns) when nothing could be parsed.
            - str: The detected game type: ``'integrative_negotiation'``,
              ``'company_car'``, ``'resource_allocation'`` or ``'unknown'``.

    Raises:
        FileNotFoundError: If the specified file does not exist.
        UnicodeDecodeError: If the file is not valid UTF-8.

    Example:
        >>> df, game_type = parse_negotiation_log_corrected("log_file.out")
        >>> print(game_type)
        'integrative_negotiation'
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        log_text = f.read()

    # Detect game type
    if re.search(r'IT\s+Team.*Marketing\s+Team', log_text, re.IGNORECASE):
        game_type = 'integrative_negotiation'
    elif re.search(r'BUYER.*SELLER|company\s+car', log_text, re.IGNORECASE):
        game_type = 'company_car'
    elif re.search(r'Development\s+Team.*Marketing\s+Team|GPU|resource\s+allocation',
                   log_text, re.IGNORECASE):
        game_type = 'resource_allocation'
    else:
        game_type = 'unknown'

    # re.split with one capture group yields
    # [preamble, num_1, body_1, num_2, body_2, ...], so numbers and bodies
    # can be paired directly.
    iteration_blocks = re.split(r'===\s*Iteration\s+(\d+)/\d+\s*===', log_text)
    data = []

    for number_text, block in zip(iteration_blocks[1::2], iteration_blocks[2::2]):
        iteration_num = int(number_text)
        if not block.strip():
            continue

        # Parse role assignments. The patterns deliberately do not anchor on
        # the decorative emoji that precede the log lines: the textual marker
        # is sufficient and keeps parsing independent of how the log's
        # non-ASCII characters were encoded.
        model_role_mapping = {}
        role_assignment_match = re.search(r'\[ROLE ASSIGNMENT\]\s*(.*)', block, re.IGNORECASE)
        if role_assignment_match:
            assignment_text = role_assignment_match.group(1)
            individual_assignments = re.findall(
                r'(model_[abc])\s*=\s*(\w+)', assignment_text, re.IGNORECASE
            )
            for model, role in individual_assignments:
                # Matching is case-insensitive, so normalise the key to keep
                # it comparable with the winner / first-mover names below.
                model_role_mapping[model.lower()] = role.upper()

        if len(model_role_mapping) < 2:
            print(f"⚠️ Warning: Could not find role assignments for iteration {iteration_num}")
            continue

        # Parse first mover - handle both integrative and company car patterns
        # Integrative: "Player model_x made proposal (#1/N):"
        # Company car: "Player model_x made offer <amount> (proposal 1/N)"
        # N (the proposal limit) is not assumed to be 4.
        first_proposal_integrative = re.search(
            r'Player\s+(model_[abc])\s+made\s+proposal\s+\(#1/\d+\)',
            block, re.IGNORECASE
        )
        first_proposal_company_car = re.search(
            r'Player\s+(model_[abc])\s+made\s+offer.*\(proposal\s+1/\d+\)',
            block, re.IGNORECASE
        )
        if first_proposal_integrative:
            first_mover = first_proposal_integrative.group(1).lower()
        elif first_proposal_company_car:
            first_mover = first_proposal_company_car.group(1).lower()
        else:
            first_mover = 'unknown'

        # Parse winner: prefer the explicit winner line, fall back to
        # comparing the two logged utilities.
        winner_match = re.search(
            r'\[LLM\s+WINNER\].*?\(player\s+(model_[abc])\s+won\)',
            block, re.IGNORECASE
        )
        if winner_match:
            winner = winner_match.group(1).lower()
        else:
            utility_debug = re.search(
                r'model_([abc])\s+utility\s*=\s*([-\d.]+),\s*model_([abc])\s+utility\s*=\s*([-\d.]+)',
                block, re.IGNORECASE
            )
            if not utility_debug:
                print(f"⚠️ Warning: Could not determine winner for iteration {iteration_num}")
                continue
            model1, util1, model2, util2 = utility_debug.groups()
            util1, util2 = float(util1), float(util2)
            if util1 > util2:
                winner = f'model_{model1.lower()}'
            elif util2 > util1:
                winner = f'model_{model2.lower()}'
            else:
                winner = 'tie'

        # Create ONE ROW PER GAME (correct statistical approach); ties carry
        # no win information and are left out.
        if winner != 'tie':
            data.append({
                'Iteration': iteration_num,
                'Winning_Role': model_role_mapping.get(winner, 'unknown'),  # Which role won
                'First_Mover_Role': model_role_mapping.get(first_mover, 'unknown'),  # Which role went first
                'Winner_Model': winner,  # Which model won (for model comparison)
                'First_Mover_Model': first_mover,  # Which model went first
                'Game_type': game_type,
                'Model_Assignments': model_role_mapping,  # For reference
            })

    df = pd.DataFrame(data)
    print(f"✅ Parsed {len(df)} games with clear winners")
    return df, game_type
def analyze_role_bias_corrected(df, game_type):
    """
    Test whether one of the two roles wins more often than the other.

    Wins are counted per role from the ``Winning_Role`` column and compared
    against an even 50/50 split with a chi-square goodness-of-fit test
    (1 degree of freedom). Cohen's h between the two observed win
    proportions is reported as the effect size.

    Args:
        df (pd.DataFrame): Parsed game data with a ``Winning_Role`` column,
            one row per game.
        game_type (str): The type of game (e.g., 'integrative_negotiation').
            Accepted for interface symmetry; not used in the computation.

    Returns:
        dict or None: ``None`` when ``df`` is empty or when the number of
        distinct winning roles is not exactly two. Otherwise a dictionary
        with:
            - 'chi2' (float): Chi-square test statistic.
            - 'p_value' (float): P-value of the test.
            - 'cohens_h' (float): Effect size (Cohen's h).
            - 'effect_size' (str): 'negligible', 'small', 'medium' or 'large'.
            - 'significant' (bool): Whether p < 0.05.

    Example:
        >>> results = analyze_role_bias_corrected(df, "integrative_negotiation")
        >>> print(results['significant'])
        True
    """
    print("\n## 📈 ROLE BIAS ANALYSIS (CORRECTED)")
    print("=" * 60)
    print("Question: Do certain roles (IT vs Marketing, Buyer vs Seller) have inherent advantages?")

    if df.empty:
        # Return None (not a message string) so callers can rely on the
        # documented dict-or-None contract.
        print("No data available")
        return None

    # Count wins by role - this is the CORRECT approach
    role_wins = df['Winning_Role'].value_counts()
    total_games = len(df)

    print("\n### Win Counts by Role:")
    for role, wins in role_wins.items():
        print(f"- {role}: {wins}/{total_games} games ({wins / total_games:.1%})")

    if len(role_wins) != 2:
        print("Cannot perform chi-square test: Need exactly 2 roles")
        return None

    # Chi-square goodness of fit test
    # H0: Roles are equally likely to win (50/50 for 2 roles).
    # The expected counts are a theoretical distribution, not a second
    # sample, so this must be `chisquare`; feeding [observed, expected] to
    # chi2_contingency would run an independence test on a fabricated 2x2
    # table and understate the evidence.
    observed = role_wins.to_numpy()
    expected = [total_games / 2, total_games / 2]
    chi2, p_value = chisquare(observed, f_exp=expected)
    chi2, p_value = float(chi2), float(p_value)

    # Effect size (Cohen's h for proportion difference)
    p1, p2 = observed[0] / total_games, observed[1] / total_games
    cohens_h = float(2 * (np.arcsin(np.sqrt(p1)) - np.arcsin(np.sqrt(p2))))

    significant = p_value < 0.05
    significance = "**SIGNIFICANT ROLE BIAS**" if significant else "No significant role bias"

    print("\n### Statistical Test:")
    print(f"- Chi-square test: χ²(1) = {chi2:.2f}, p = {p_value:.4f}")
    print(f"- Result: {significance}")
    print(f"- Effect size (Cohen's h): {abs(cohens_h):.3f}")

    magnitude = abs(cohens_h)
    if magnitude < 0.2:
        effect_interp = "negligible"
    elif magnitude < 0.5:
        effect_interp = "small"
    elif magnitude < 0.8:
        effect_interp = "medium"
    else:
        effect_interp = "large"
    print(f"- Effect interpretation: {effect_interp} difference")

    return {
        'chi2': chi2,
        'p_value': p_value,
        'cohens_h': cohens_h,
        'effect_size': effect_interp,
        'significant': significant,
    }
def analyze_first_mover_bias_corrected(df):
    """
    Test whether the model that makes the first proposal wins more often.

    A ``first_mover_won`` column (1 when ``Winner_Model`` equals
    ``First_Mover_Model``, else 0) is added to ``df`` in place. The share of
    first-mover wins is then compared against 50% with a chi-square
    goodness-of-fit test (1 degree of freedom). Games whose first mover is
    ``'unknown'`` keep a 0 in the new column but are excluded from the test,
    because they say nothing about who moved first.

    Args:
        df (pd.DataFrame): Parsed game data with ``Winner_Model`` and
            ``First_Mover_Model`` columns. Modified in place.

    Returns:
        dict or None: ``None`` when no game has a known first mover.
        Otherwise a dictionary with:
            - 'chi2' (float): Chi-square test statistic.
            - 'p_value' (float): P-value of the test.
            - 'cohens_h' (float): Effect size (Cohen's h) versus 0.5.
            - 'first_mover_win_rate' (float): Win rate of the first mover.
            - 'significant' (bool): Whether p < 0.05.

    Example:
        >>> results = analyze_first_mover_bias_corrected(df)
        >>> print(results['first_mover_win_rate'])
        0.65
    """
    print("\n## 🚀 FIRST-MOVER BIAS ANALYSIS (CORRECTED)")
    print("=" * 60)
    print("Question: Does going first provide an advantage?")

    # Create binary outcome: did the first mover win?
    df['first_mover_won'] = (df['Winner_Model'] == df['First_Mover_Model']).astype(int)

    # Only games with an identified first mover are informative; counting
    # 'unknown' games would book them all as second-mover wins.
    known = df[df['First_Mover_Model'] != 'unknown']
    total_games = len(known)
    if total_games == 0:
        print("Cannot analyze first-mover bias: no games with a known first mover")
        return None
    skipped = len(df) - total_games
    if skipped:
        print(f"- Note: {skipped} game(s) with unknown first mover excluded")

    first_mover_wins = int(known['first_mover_won'].sum())
    first_mover_win_rate = first_mover_wins / total_games

    print("\n### First-Mover Performance:")
    print(f"- First mover won: {first_mover_wins}/{total_games} games ({first_mover_win_rate:.1%})")
    print(f"- Second mover won: {total_games - first_mover_wins}/{total_games} games "
          f"({1 - first_mover_win_rate:.1%})")

    # Chi-square goodness of fit test
    # H0: First mover wins 50% of the time.
    # `chisquare` compares observed counts with theoretical expectations;
    # chi2_contingency would wrongly treat the expectations as data.
    observed = [first_mover_wins, total_games - first_mover_wins]
    expected = [total_games / 2, total_games / 2]
    chi2, p_value = chisquare(observed, f_exp=expected)
    chi2, p_value = float(chi2), float(p_value)

    # Effect size (Cohen's h) of the observed rate against the 0.5 null
    cohens_h = float(2 * (np.arcsin(np.sqrt(first_mover_win_rate)) - np.arcsin(np.sqrt(0.5))))

    significant = p_value < 0.05
    significance = (
        "**SIGNIFICANT FIRST-MOVER ADVANTAGE**" if significant
        else "No significant first-mover advantage"
    )

    print("\n### Statistical Test:")
    print(f"- Chi-square test: χ²(1) = {chi2:.2f}, p = {p_value:.4f}")
    print(f"- Result: {significance}")
    print(f"- Effect size (Cohen's h): {abs(cohens_h):.3f}")

    return {
        'chi2': chi2,
        'p_value': p_value,
        'cohens_h': cohens_h,
        'first_mover_win_rate': first_mover_win_rate,
        'significant': significant,
    }
def bias_adjusted_model_comparison(df, role_bias_significant=False, first_mover_bias_significant=False):
    """
    Compare the two models while controlling for role and first-mover effects.

    A logistic regression of "model_a won" is fitted on two covariates that
    describe model_a's situation in each game: the role it played and
    whether it moved first. The bias-adjusted win probability is the mean
    predicted probability over a balanced grid of those covariates (every
    known role x every known move order, equally weighted), i.e. the win
    rate model_a would be expected to have if roles and move order had been
    assigned evenly.

    ``model_a`` is the alphabetically first model name found in
    ``Model_Assignments``. A ``model_a_won`` column is added to ``df`` in
    place.

    Args:
        df (pd.DataFrame): Parsed game data with ``Winner_Model``,
            ``First_Mover_Model`` and ``Model_Assignments`` columns.
            Modified in place.
        role_bias_significant (bool): Whether role bias was detected as
            significant. Only affects the explanatory text printed.
        first_mover_bias_significant (bool): Whether first-mover bias was
            detected as significant. Only affects the explanatory text printed.

    Returns:
        dict or None: ``None`` when ``df`` is empty, when the games do not
        involve exactly two models, or when the regression cannot be fitted.
        Otherwise a dictionary with:
            - 'model_a' (str): Name of the first model.
            - 'model_b' (str): Name of the second model.
            - 'adjusted_prob_a' (float): Bias-adjusted win probability for model_a.
            - 'raw_prob_a' (float): Raw win probability for model_a.
            - 'adjustment' (float): Difference between adjusted and raw probabilities.

    Example:
        >>> results = bias_adjusted_model_comparison(df, True, False)
        >>> print(results['adjusted_prob_a'])
        0.72
    """
    print("\n## 🎯 BIAS-ADJUSTED MODEL COMPARISON")
    print("=" * 80)

    if role_bias_significant or first_mover_bias_significant:
        print("Question: Which model performs better when controlling for detected biases?")
        print("Note: Bias correction is NECESSARY due to significant bias detection.")
    else:
        print("Question: Which model performs better? (No significant bias detected)")
        print("Note: Bias correction applied for completeness, but raw results should be similar.")

    if df.empty:
        print("No games available for model comparison")
        return None

    # Get unique models
    all_models = set()
    for assignments in df['Model_Assignments']:
        all_models.update(assignments)

    if len(all_models) != 2:
        print(f"Expected 2 models, found {len(all_models)}: {all_models}")
        return None

    model_a, model_b = sorted(all_models)

    # Create model outcome: which model won (regardless of role)
    df['model_a_won'] = (df['Winner_Model'] == model_a).astype(int)

    # Covariates must describe model_a's *pre-game* situation. Conditioning
    # on the role that ended up winning would condition on the outcome and
    # merely recover how often model_a held that role.
    order_labels = {model_a: 'first', model_b: 'second'}
    design = pd.DataFrame({
        'model_a_won': df['model_a_won'],
        'model_a_role': df['Model_Assignments'].map(
            lambda roles: roles.get(model_a, 'unknown')
        ),
        'model_a_order': df['First_Mover_Model'].map(
            lambda mover: order_labels.get(mover, 'unknown')
        ),
    })

    # Logistic regression controlling for biases
    # outcome = model performance, controls = role played and move order
    try:
        formula = 'model_a_won ~ C(model_a_role) + C(model_a_order)'
        model = smf.logit(formula=formula, data=design).fit(disp=False)

        print("\n### Logistic Regression Model:")
        print(f"Formula: {model_a}_won ~ role_advantage + first_mover_advantage")
        print(model.summary())

        # Marginal standardisation: average the predictions over a balanced
        # grid. The intercept alone would only be the probability in the
        # reference cell, not a bias-adjusted figure. 'unknown' levels are
        # left out of the grid unless nothing else was observed.
        role_levels = sorted(set(design['model_a_role']) - {'unknown'}) or ['unknown']
        order_levels = sorted(set(design['model_a_order']) - {'unknown'}) or ['unknown']
        balanced = pd.DataFrame(
            [(role, order) for role in role_levels for order in order_levels],
            columns=['model_a_role', 'model_a_order'],
        )
        adjusted_prob_a = float(np.mean(model.predict(balanced)))
        adjusted_prob_b = 1 - adjusted_prob_a

        print("\n### 🏆 BIAS-ADJUSTED MODEL COMPARISON:")
        print(f"- {model_a}: {adjusted_prob_a:.3f} ({adjusted_prob_a*100:.1f}%) win probability")
        print(f"- {model_b}: {adjusted_prob_b:.3f} ({adjusted_prob_b*100:.1f}%) win probability")
        print("(These probabilities are adjusted for role bias and first-mover advantage)")

        # Raw comparison for reference
        raw_prob_a = float(df['model_a_won'].sum() / len(df))

        print("\n### 📊 RAW vs ADJUSTED COMPARISON:")
        print(f"- {model_a} raw win rate: {raw_prob_a:.3f} ({raw_prob_a*100:.1f}%)")
        print(f"- {model_a} bias-adjusted: {adjusted_prob_a:.3f} ({adjusted_prob_a*100:.1f}%)")

        adjustment = adjusted_prob_a - raw_prob_a
        direction = "higher" if adjustment > 0 else "lower"
        print(f"- Adjustment: {adjustment:+.3f} ({direction} due to bias correction)")

        return {
            'model_a': model_a,
            'model_b': model_b,
            'adjusted_prob_a': adjusted_prob_a,
            'raw_prob_a': raw_prob_a,
            'adjustment': adjustment,
        }

    except Exception as e:
        # Deliberately broad: fitting can fail in several unrelated ways
        # (perfect separation, singular design matrix, formula errors) and
        # the caller treats every failure as "no comparison available".
        print(f"Could not fit logistic regression: {e}")
        return None
def main():
    """
    Run the corrected bias analysis on the log file named on the command line.

    Parses the log, runs the role-bias and first-mover-bias tests, performs
    the bias-adjusted model comparison, prints a summary and exports the
    per-game data to ``<log name without extension>_corrected_analysis.csv``
    next to the input file.

    Usage:
        python compare_games_statistics_FIXED.py <log_file.out>

    Args:
        None (command-line arguments are used).

    Returns:
        None: Outputs results to the console and exports corrected data to a
        CSV file. Exits with status 1 when no log file argument is given.

    Example:
        $ python compare_games_statistics_FIXED.py integrative_negotiation_1975553.out
    """
    if len(sys.argv) < 2:
        print("Usage: python compare_games_statistics_FIXED.py <log_file.out>")
        sys.exit(1)

    file_path = sys.argv[1]
    print(f"\n{'='*80}")
    print(f"📂 CORRECTED BIAS ANALYSIS: {file_path}")
    print(f"{'='*80}")

    # Parse with correct structure
    df, game_type = parse_negotiation_log_corrected(file_path)

    if df.empty:
        print("❌ ERROR: No data could be parsed from the log file.")
        return

    print("\n📊 Dataset Summary:")
    print(f"- Game type: {game_type.upper()}")
    print(f"- Total games analyzed: {len(df)}")
    print(f"- Roles: {df['Winning_Role'].unique()}")

    # Analyze biases with correct methods
    role_results = analyze_role_bias_corrected(df, game_type)
    first_mover_results = analyze_first_mover_bias_corrected(df)

    # Check if any bias was significant. bool() keeps the flags real
    # booleans even when a result is None.
    role_bias_significant = bool(role_results and role_results.get('significant', False))
    first_mover_bias_significant = bool(
        first_mover_results and first_mover_results.get('significant', False)
    )
    any_bias_detected = role_bias_significant or first_mover_bias_significant

    # Model comparison with bias adjustment
    model_results = bias_adjusted_model_comparison(
        df, role_bias_significant, first_mover_bias_significant
    )

    print(f"\n{'='*80}")
    print("## 📋 SUMMARY OF CORRECTED ANALYSIS")
    print(f"{'='*80}")

    print("### 🧪 BIAS DETECTION RESULTS:")
    if role_bias_significant:
        print(f"✅ **Role bias detected**: {role_results['effect_size']} effect "
              f"(p = {role_results['p_value']:.4f})")
    else:
        print("✅ **No significant role bias detected**")

    if first_mover_bias_significant:
        print(f"✅ **First-mover advantage detected**: "
              f"{first_mover_results['first_mover_win_rate']:.1%} win rate "
              f"(p = {first_mover_results['p_value']:.4f})")
    else:
        print("✅ **No significant first-mover advantage detected**")

    print("\n### 🏆 FINAL MODEL PERFORMANCE RANKING (BIAS-ADJUSTED):")
    if model_results:
        model_a = model_results['model_a']
        model_b = model_results['model_b']
        prob_a = model_results['adjusted_prob_a']
        prob_b = 1 - prob_a

        # Determine winner
        if prob_a > 0.5:
            winner, winner_prob = model_a, prob_a
            loser, loser_prob = model_b, prob_b
        else:
            winner, winner_prob = model_b, prob_b
            loser, loser_prob = model_a, prob_a

        print(f"🥇 **WINNER: {winner}**")
        print(f" - Bias-adjusted win probability: {winner_prob:.3f} ({winner_prob*100:.1f}%)")
        print(f"🥈 **Second place: {loser}**")
        print(f" - Bias-adjusted win probability: {loser_prob:.3f} ({loser_prob*100:.1f}%)")

        # Show margin of victory
        margin = abs(winner_prob - loser_prob)
        if margin > 0.2:
            dominance = "Strong advantage"
        elif margin > 0.1:
            dominance = "Moderate advantage"
        elif margin > 0.05:
            dominance = "Slight advantage"
        else:
            dominance = "Essentially tied"
        print(f" - Margin: {margin:.3f} ({dominance})")

        # Show if bias correction mattered
        adjustment = model_results['adjustment']

        if any_bias_detected:
            print("\n⚠️ **Bias Correction Applied**: Statistically significant bias was detected and corrected.")
            print(f" - Adjustment magnitude: {adjustment:+.3f} (difference in win probability)")
            if abs(adjustment) > 0.05:
                print(" - Impact: Large correction - raw win rates would have been misleading")
            elif abs(adjustment) > 0.02:
                print(" - Impact: Moderate correction applied")
            else:
                print(" - Impact: Small correction (bias was significant but effect was limited)")
        else:
            print("\n✅ **No Bias Correction Needed**: No statistically significant bias detected.")
            print(f" - Raw win rates are reliable (adjustment: {adjustment:+.3f})")
            if abs(adjustment) > 0.05:
                print(" - Note: Large numerical difference due to random variation, not systematic bias")

        print("\n📊 **Statistical Details**:")
        print(" - Analysis method: Multiple logistic regression")
        print(" - Controls: Role bias + First-mover advantage")
        print(f" - Sample size: {len(df)} independent games")
        print(" - Each game counted once (no double-counting)")

        # Final interpretation based on bias detection
        print("\n🎯 **INTERPRETATION**:")
        if any_bias_detected:
            print(" The bias-adjusted results are MORE RELIABLE than raw win rates")
            print(" because significant bias was detected and corrected.")
        else:
            print(" The bias-adjusted and raw results should be similar")
            print(" because no significant bias was detected.")
            print(" Either result can be trusted for model comparison.")
    else:
        print("❌ **Could not determine model performance** (insufficient data or regression failed)")

    # Export data. Build the name from the file stem instead of
    # str.replace('.out', ...): replace() returns the input path unchanged
    # when it has no '.out' (so the CSV would overwrite the log) and also
    # rewrites '.out' inside directory names.
    source = Path(file_path)
    output_csv = str(source.with_name(f"{source.stem}_corrected_analysis.csv"))
    df.to_csv(output_csv, index=False)
    print(f"\n✅ Corrected data exported to '{output_csv}'")
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()