import re
import sys
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from scipy.stats import chi2_contingency
import math
# How to run an example:
# python compare_games_statistics_FIXED.py batch_comparison/resource_game/resource_allocation_2013440.out
# Column order of the DataFrame built by parse_negotiation_log_corrected. Passing
# it explicitly keeps the schema stable even when no game could be parsed.
_GAME_COLUMNS = [
    'Iteration',
    'Winning_Role',
    'First_Mover_Role',
    'Winner_Model',
    'First_Mover_Model',
    'Game_type',
    'Model_Assignments',
]

# Game types are tried in this order; the first pattern found anywhere in the
# log decides the type.
_GAME_TYPE_PATTERNS = (
    ('integrative_negotiation', r'IT\s+Team.*Marketing\s+Team'),
    ('company_car', r'BUYER.*SELLER|company\s+car'),
    ('resource_allocation',
     r'Development\s+Team.*Marketing\s+Team|GPU|resource\s+allocation'),
)

# The patterns below anchor on the textual markers only. Earlier versions also
# required the decorative emoji in front of each marker, which made matching
# depend on how the log (and this source file) happened to be encoded.
_ITERATION_SPLIT_RE = re.compile(r'===\s*Iteration\s+(\d+)/\d+\s*===')
_ROLE_ASSIGNMENT_RE = re.compile(r'\[ROLE ASSIGNMENT\]\s*(.*)', re.IGNORECASE)
_ROLE_PAIR_RE = re.compile(r'(model_[abc])\s*=\s*(\w+)', re.IGNORECASE)
_FIRST_PROPOSAL_RES = (
    # Integrative: "💡 Player model_x made proposal (#1/4):"
    re.compile(r'Player\s+(model_[abc])\s+made\s+proposal\s+\(#1/4\)',
               re.IGNORECASE),
    # Company car: "💡 Player model_x made offer €XX,XXX (proposal 1/4)"
    re.compile(r'Player\s+(model_[abc])\s+made\s+offer.*\(proposal\s+1/4\)',
               re.IGNORECASE),
)
_WINNER_RE = re.compile(
    r'\[LLM\s+WINNER\].*?\(player\s+(model_[abc])\s+won\)', re.IGNORECASE
)
_UTILITY_RE = re.compile(
    r'model_([abc])\s+utility\s*=\s*([-\d.]+),'
    r'\s*model_([abc])\s+utility\s*=\s*([-\d.]+)',
    re.IGNORECASE,
)


def _detect_game_type(log_text):
    """Return the first game type whose marker pattern occurs in the log."""
    for game_type, pattern in _GAME_TYPE_PATTERNS:
        if re.search(pattern, log_text, re.IGNORECASE):
            return game_type
    return 'unknown'


def _find_first_mover(block):
    """Return the model that made proposal 1/4 in this block, or 'unknown'."""
    for pattern in _FIRST_PROPOSAL_RES:
        match = pattern.search(block)
        if match:
            return match.group(1)
    return 'unknown'


def _find_winner(block):
    """Return the winning model name, 'tie', or None when undeterminable."""
    declared = _WINNER_RE.search(block)
    if declared:
        return declared.group(1)

    # Fall back to comparing the two logged utilities.
    utilities = _UTILITY_RE.search(block)
    if not utilities:
        return None
    first, first_util, second, second_util = utilities.groups()
    try:
        first_util, second_util = float(first_util), float(second_util)
    except ValueError:
        # The character class also accepts things like "-" or "1.2.3".
        return None
    if first_util > second_util:
        return f'model_{first}'
    if second_util > first_util:
        return f'model_{second}'
    return 'tie'


def parse_negotiation_log_corrected(file_path):
    """
    Parse a negotiation log file into one row per game that has a clear winner.

    The log is split on "=== Iteration N/M ===" headers. Iterations without at
    least two role assignments or without a determinable winner are skipped
    with a warning; tied iterations are skipped silently.

    Args:
        file_path (str): Path to the negotiation log file (read as UTF-8).

    Returns:
        tuple: A tuple containing:
            - pd.DataFrame: One row per decided game with the columns
              Iteration, Winning_Role, First_Mover_Role, Winner_Model,
              First_Mover_Model, Game_type and Model_Assignments. The columns
              are present even when no game could be parsed.
            - str: The detected game type ('integrative_negotiation',
              'company_car', 'resource_allocation' or 'unknown').

    Raises:
        FileNotFoundError: If the specified file does not exist (raised by
            ``open``; other ``OSError`` subclasses propagate as well).

    Example:
        >>> df, game_type = parse_negotiation_log_corrected("log_file.out")
        >>> print(game_type)
        'integrative_negotiation'
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        log_text = f.read()

    game_type = _detect_game_type(log_text)

    # re.split with one capture group yields
    # [preamble, number_1, body_1, number_2, body_2, ...].
    parts = _ITERATION_SPLIT_RE.split(log_text)
    rows = []
    for number_text, block in zip(parts[1::2], parts[2::2]):
        iteration_num = int(number_text)
        if not block.strip():
            continue

        # Parse role assignments, e.g. "model_a = IT, model_b = MARKETING".
        model_role_mapping = {}
        role_assignment_match = _ROLE_ASSIGNMENT_RE.search(block)
        if role_assignment_match:
            for model, role in _ROLE_PAIR_RE.findall(role_assignment_match.group(1)):
                model_role_mapping[model] = role.upper()
        if len(model_role_mapping) < 2:
            print(f"⚠️ Warning: Could not find role assignments for iteration {iteration_num}")
            continue

        first_mover = _find_first_mover(block)

        winner = _find_winner(block)
        if winner is None:
            print(f"⚠️ Warning: Could not determine winner for iteration {iteration_num}")
            continue
        if winner == 'tie':
            continue

        # Create ONE ROW PER GAME (correct statistical approach)
        rows.append({
            'Iteration': iteration_num,
            'Winning_Role': model_role_mapping.get(winner, 'unknown'),
            'First_Mover_Role': model_role_mapping.get(first_mover, 'unknown'),
            'Winner_Model': winner,
            'First_Mover_Model': first_mover,
            'Game_type': game_type,
            'Model_Assignments': model_role_mapping,  # For reference
        })

    df = pd.DataFrame(rows, columns=_GAME_COLUMNS)
    print(f"✅ Parsed {len(df)} games with clear winners")
    return df, game_type
def analyze_role_bias_corrected(df, game_type):
    """
    Analyze role bias: test whether the two roles win equally often.

    Runs a chi-square goodness-of-fit test of the observed win counts per role
    against a 50/50 split and reports Cohen's h between the two win shares.

    Args:
        df (pd.DataFrame): Parsed game data; only the 'Winning_Role' column is
            read.
        game_type (str): The type of game (e.g., 'integrative_negotiation').
            Accepted for interface compatibility; not used in the computation.

    Returns:
        dict or None: None when ``df`` is empty or when the number of distinct
        winning roles is not exactly two. Otherwise a dictionary with:
            - 'chi2' (float): Chi-square test statistic.
            - 'p_value' (float): P-value of the test.
            - 'cohens_h' (float): Effect size (Cohen's h).
            - 'effect_size' (str): Interpretation of effect size.
            - 'significant' (bool): Whether p < 0.05.

    Example:
        >>> results = analyze_role_bias_corrected(df, "integrative_negotiation")
        >>> print(results['significant'])
        True
    """
    # Imported locally so this function does not depend on a module-level
    # import that the file does not have.
    from scipy.stats import chisquare

    print("\n## ROLE BIAS ANALYSIS (CORRECTED)")
    print("=" * 60)
    print("Question: Do certain roles (IT vs Marketing, Buyer vs Seller) have inherent advantages?")

    if len(df) == 0:
        # Previously a string was returned here, which callers using
        # ``result.get(...)`` could not handle.
        print("No data available")
        return None

    # Count wins by role - this is the CORRECT approach
    role_wins = df['Winning_Role'].value_counts()
    total_games = len(df)

    print("\n### Win Counts by Role:")
    for role, wins in role_wins.items():
        win_rate = wins / total_games
        print(f"- {role}: {wins}/{total_games} games ({win_rate:.1%})")

    if len(role_wins) != 2:
        print("Cannot perform chi-square test: Need exactly 2 roles")
        return None

    # Chi-square goodness of fit test
    # H0: Roles are equally likely to win (50/50 for 2 roles).
    # chisquare() defaults to equal expected frequencies. Feeding observed and
    # expected counts to chi2_contingency would instead run a (Yates-corrected)
    # independence test on a 2x2 table, which is a different hypothesis.
    observed = role_wins.values
    chi2, p_value = chisquare(observed)
    chi2, p_value = float(chi2), float(p_value)

    # Effect size (Cohen's h for proportion difference)
    p1, p2 = observed[0] / total_games, observed[1] / total_games
    cohens_h = float(2 * (np.arcsin(np.sqrt(p1)) - np.arcsin(np.sqrt(p2))))

    significant = p_value < 0.05
    significance = "**SIGNIFICANT ROLE BIAS**" if significant else "No significant role bias"

    print("\n### Statistical Test:")
    print(f"- Chi-square test: χ²(1) = {chi2:.2f}, p = {p_value:.4f}")
    print(f"- Result: {significance}")
    print(f"- Effect size (Cohen's h): {abs(cohens_h):.3f}")

    magnitude = abs(cohens_h)
    if magnitude < 0.2:
        effect_interp = "negligible"
    elif magnitude < 0.5:
        effect_interp = "small"
    elif magnitude < 0.8:
        effect_interp = "medium"
    else:
        effect_interp = "large"
    print(f"- Effect interpretation: {effect_interp} difference")

    return {
        'chi2': chi2,
        'p_value': p_value,
        'cohens_h': cohens_h,
        'effect_size': effect_interp,
        'significant': significant,
    }
def analyze_first_mover_bias_corrected(df):
    """
    Analyze first-mover bias: test whether the first mover wins 50% of games.

    Adds a 0/1 column 'first_mover_won' to ``df`` in place (1 when
    'Winner_Model' equals 'First_Mover_Model'), then runs a chi-square
    goodness-of-fit test against a 50/50 split. Games whose first mover is the
    'unknown' sentinel are left out of the test, because they cannot be
    attributed to either the first or the second mover.

    Args:
        df (pd.DataFrame): Parsed game data with the 'Winner_Model' and
            'First_Mover_Model' columns. Modified in place (see above).

    Returns:
        dict or None: None when ``df`` is empty or no game has a known first
        mover. Otherwise a dictionary with:
            - 'chi2' (float): Chi-square test statistic.
            - 'p_value' (float): P-value of the test.
            - 'cohens_h' (float): Effect size (Cohen's h) versus 0.5.
            - 'first_mover_win_rate' (float): Win rate of the first mover.
            - 'significant' (bool): Whether p < 0.05.

    Example:
        >>> results = analyze_first_mover_bias_corrected(df)
        >>> print(results['first_mover_win_rate'])
        0.65
    """
    # Imported locally so this function does not depend on a module-level
    # import that the file does not have.
    from scipy.stats import chisquare

    print("\n## FIRST-MOVER BIAS ANALYSIS (CORRECTED)")
    print("=" * 60)
    print("Question: Does going first provide an advantage?")

    if len(df) == 0:
        print("No data available")
        return None

    # Create binary outcome: did the first mover win?
    df['first_mover_won'] = (df['Winner_Model'] == df['First_Mover_Model']).astype(int)

    # Only games with an identified first mover can inform the test; counting
    # the others as "second mover won" would bias the rate downwards.
    known_first_mover = df['First_Mover_Model'] != 'unknown'
    total_games = int(known_first_mover.sum())
    excluded_games = len(df) - total_games
    if total_games == 0:
        print("Cannot analyze first-mover bias: no games with a known first mover")
        return None

    first_mover_wins = int(df.loc[known_first_mover, 'first_mover_won'].sum())
    first_mover_win_rate = first_mover_wins / total_games

    print("\n### First-Mover Performance:")
    print(f"- First mover won: {first_mover_wins}/{total_games} games ({first_mover_win_rate:.1%})")
    print(f"- Second mover won: {total_games-first_mover_wins}/{total_games} games ({1-first_mover_win_rate:.1%})")
    if excluded_games:
        print(f"- Excluded: {excluded_games} games with unknown first mover")

    # Chi-square goodness of fit test
    # H0: First mover wins 50% of the time.
    # chisquare() defaults to equal expected frequencies. Feeding observed and
    # expected counts to chi2_contingency would instead run a (Yates-corrected)
    # independence test on a 2x2 table, which is a different hypothesis.
    observed = [first_mover_wins, total_games - first_mover_wins]
    chi2, p_value = chisquare(observed)
    chi2, p_value = float(chi2), float(p_value)

    # Effect size (Cohen's h) of the observed rate against the 0.5 null.
    p_expected = 0.5
    cohens_h = float(
        2 * (np.arcsin(np.sqrt(first_mover_win_rate)) - np.arcsin(np.sqrt(p_expected)))
    )

    significant = p_value < 0.05
    significance = (
        "**SIGNIFICANT FIRST-MOVER ADVANTAGE**"
        if significant
        else "No significant first-mover advantage"
    )

    print("\n### Statistical Test:")
    print(f"- Chi-square test: χ²(1) = {chi2:.2f}, p = {p_value:.4f}")
    print(f"- Result: {significance}")
    print(f"- Effect size (Cohen's h): {abs(cohens_h):.3f}")

    return {
        'chi2': chi2,
        'p_value': p_value,
        'cohens_h': cohens_h,
        'first_mover_win_rate': first_mover_win_rate,
        'significant': significant,
    }
def bias_adjusted_model_comparison(df, role_bias_significant=False, first_mover_bias_significant=False):
    """
    Compare the two models' win rates with a logistic regression on the
    winning role and the first-moving model.

    Adds a 0/1 column 'model_a_won' to ``df`` in place (1 when 'Winner_Model'
    is the alphabetically first model), fits
    ``model_a_won ~ C(Winning_Role) + C(First_Mover_Model)`` and converts the
    fitted intercept to a probability.

    Args:
        df (pd.DataFrame): Parsed game data with the 'Model_Assignments',
            'Winner_Model', 'Winning_Role' and 'First_Mover_Model' columns.
            Modified in place (see above).
        role_bias_significant (bool): Whether role bias was detected as
            significant. Only affects the explanatory text that is printed.
        first_mover_bias_significant (bool): Whether first-mover bias was
            detected as significant. Only affects the printed text.

    Returns:
        dict or None: None when ``df`` is empty, when the games do not involve
        exactly two models, or when the regression cannot be fitted.
        Otherwise a dictionary with:
            - 'model_a' (str): Name of the first model.
            - 'model_b' (str): Name of the second model.
            - 'adjusted_prob_a' (float): Intercept-based win probability for model_a.
            - 'raw_prob_a' (float): Raw win probability for model_a.
            - 'adjustment' (float): Difference between adjusted and raw probabilities.

    Example:
        >>> results = bias_adjusted_model_comparison(df, True, False)
        >>> print(results['adjusted_prob_a'])
        0.72
    """
    print("\n## 🎯 BIAS-ADJUSTED MODEL COMPARISON")
    print("=" * 80)
    if role_bias_significant or first_mover_bias_significant:
        print("Question: Which model performs better when controlling for detected biases?")
        print("Note: Bias correction is NECESSARY due to significant bias detection.")
    else:
        print("Question: Which model performs better? (No significant bias detected)")
        print("Note: Bias correction applied for completeness, but raw results should be similar.")

    # An empty frame (possibly without any columns) cannot be analysed and
    # would otherwise fail on the column lookups below.
    if len(df) == 0 or 'Model_Assignments' not in df.columns:
        print("No data available for model comparison")
        return None

    # Get unique models
    all_models = set()
    for assignments in df['Model_Assignments']:
        all_models.update(assignments.keys())
    if len(all_models) != 2:
        print(f"Expected 2 models, found {len(all_models)}: {all_models}")
        return None
    model_a, model_b = sorted(all_models)

    # Create model outcome: which model won (regardless of role)
    df['model_a_won'] = (df['Winner_Model'] == model_a).astype(int)

    # Logistic regression controlling for biases.
    # NOTE(review): 'Winning_Role' is itself derived from the game outcome;
    # confirm it is the intended control rather than model_a's assigned role.
    try:
        formula = 'model_a_won ~ C(Winning_Role) + C(First_Mover_Model)'
        model = smf.logit(formula=formula, data=df).fit(disp=False)
        print("\n### Logistic Regression Model:")
        print(f"Formula: {model_a}_won ~ role_advantage + first_mover_advantage")
        print(model.summary())
        # NOTE(review): with default treatment coding the intercept is
        # presumably the log-odds at the reference level of both factors, not
        # an average over them - confirm this is the intended "adjusted" value.
        adjusted_log_odds = model.params['Intercept']
    except Exception as e:
        # Deliberately broad: fitting can fail in several library-specific
        # ways (e.g. separation, singular design) and this analysis is
        # best-effort, so the failure is reported instead of propagated.
        print(f"Could not fit logistic regression: {e}")
        return None

    adjusted_prob_a = float(1 / (1 + np.exp(-adjusted_log_odds)))
    adjusted_prob_b = 1 - adjusted_prob_a

    print("\n### BIAS-ADJUSTED MODEL COMPARISON:")
    print(f"- {model_a}: {adjusted_prob_a:.3f} ({adjusted_prob_a*100:.1f}%) win probability")
    print(f"- {model_b}: {adjusted_prob_b:.3f} ({adjusted_prob_b*100:.1f}%) win probability")
    print("(These probabilities are adjusted for role bias and first-mover advantage)")

    # Raw comparison for reference
    raw_prob_a = float(df['model_a_won'].sum() / len(df))
    print("\n### RAW vs ADJUSTED COMPARISON:")
    print(f"- {model_a} raw win rate: {raw_prob_a:.3f} ({raw_prob_a*100:.1f}%)")
    print(f"- {model_a} bias-adjusted: {adjusted_prob_a:.3f} ({adjusted_prob_a*100:.1f}%)")

    adjustment = adjusted_prob_a - raw_prob_a
    direction = "higher" if adjustment > 0 else "lower"
    print(f"- Adjustment: {adjustment:+.3f} ({direction} due to bias correction)")

    return {
        'model_a': model_a,
        'model_b': model_b,
        'adjusted_prob_a': adjusted_prob_a,
        'raw_prob_a': raw_prob_a,
        'adjustment': adjustment,
    }
def main():
    """
    Run the corrected bias analysis on the negotiation log named on the
    command line.

    Parses the log, runs the role-bias, first-mover-bias and model-comparison
    analyses, prints a summary and writes the parsed games to
    '<log path without a trailing .out>_corrected_analysis.csv'.

    Usage:
        python compare_games_statistics_FIXED.py <log_file.out>

    Args:
        None (command-line arguments are used).

    Returns:
        None: Outputs results to the console and exports corrected data to a
        CSV file. Exits with status 1 when no log file is given or the file
        cannot be read.

    Example:
        $ python compare_games_statistics_FIXED.py integrative_negotiation_1975553.out
    """
    if len(sys.argv) < 2:
        print("Usage: python compare_games_statistics_FIXED.py <log_file.out>")
        sys.exit(1)
    file_path = sys.argv[1]

    print(f"\n{'='*80}")
    print(f"CORRECTED BIAS ANALYSIS: {file_path}")
    print(f"{'='*80}")

    # Parse with correct structure
    try:
        df, game_type = parse_negotiation_log_corrected(file_path)
    except OSError as exc:
        print(f"❌ ERROR: Could not read log file '{file_path}': {exc}")
        sys.exit(1)
    if df.empty:
        print("❌ ERROR: No data could be parsed from the log file.")
        return

    print("\nDataset Summary:")
    print(f"- Game type: {game_type.upper()}")
    print(f"- Total games analyzed: {len(df)}")
    print(f"- Roles: {df['Winning_Role'].unique()}")

    # Analyze biases with correct methods
    role_results = analyze_role_bias_corrected(df, game_type)
    first_mover_results = analyze_first_mover_bias_corrected(df)

    # Check if any bias was significant. The isinstance guard keeps these
    # plain booleans even when an analysis returned None or a non-dict value.
    role_bias_significant = (
        isinstance(role_results, dict) and bool(role_results.get('significant', False))
    )
    first_mover_bias_significant = (
        isinstance(first_mover_results, dict)
        and bool(first_mover_results.get('significant', False))
    )
    any_bias_detected = role_bias_significant or first_mover_bias_significant

    # Model comparison with bias adjustment
    model_results = bias_adjusted_model_comparison(
        df, role_bias_significant, first_mover_bias_significant
    )

    print(f"\n{'='*80}")
    print("## SUMMARY OF CORRECTED ANALYSIS")
    print(f"{'='*80}")

    print("### 🧪 BIAS DETECTION RESULTS:")
    if role_bias_significant:
        print(f"✅ **Role bias detected**: {role_results['effect_size']} effect (p = {role_results['p_value']:.4f})")
    else:
        print("✅ **No significant role bias detected**")
    if first_mover_bias_significant:
        print(f"✅ **First-mover advantage detected**: {first_mover_results['first_mover_win_rate']:.1%} win rate (p = {first_mover_results['p_value']:.4f})")
    else:
        print("✅ **No significant first-mover advantage detected**")

    print("\n### FINAL MODEL PERFORMANCE RANKING (BIAS-ADJUSTED):")
    if model_results:
        model_a = model_results['model_a']
        model_b = model_results['model_b']
        prob_a = model_results['adjusted_prob_a']
        prob_b = 1 - prob_a

        # Determine winner (an exact 0.5 is reported with model_b first).
        if prob_a > 0.5:
            winner, winner_prob = model_a, prob_a
            loser, loser_prob = model_b, prob_b
        else:
            winner, winner_prob = model_b, prob_b
            loser, loser_prob = model_a, prob_a

        print(f"🥇 **WINNER: {winner}**")
        print(f" - Bias-adjusted win probability: {winner_prob:.3f} ({winner_prob*100:.1f}%)")
        print(f"🥈 **Second place: {loser}**")
        print(f" - Bias-adjusted win probability: {loser_prob:.3f} ({loser_prob*100:.1f}%)")

        # Show margin of victory
        margin = abs(winner_prob - loser_prob)
        if margin > 0.2:
            dominance = "Strong advantage"
        elif margin > 0.1:
            dominance = "Moderate advantage"
        elif margin > 0.05:
            dominance = "Slight advantage"
        else:
            dominance = "Essentially tied"
        print(f" - Margin: {margin:.3f} ({dominance})")

        # Show if bias correction mattered
        adjustment = model_results['adjustment']
        if any_bias_detected:
            print("\n⚠️ **Bias Correction Applied**: Statistically significant bias was detected and corrected.")
            print(f" - Adjustment magnitude: {adjustment:+.3f} (difference in win probability)")
            if abs(adjustment) > 0.05:
                print(" - Impact: Large correction - raw win rates would have been misleading")
            elif abs(adjustment) > 0.02:
                print(" - Impact: Moderate correction applied")
            else:
                print(" - Impact: Small correction (bias was significant but effect was limited)")
        else:
            print("\n✅ **No Bias Correction Needed**: No statistically significant bias detected.")
            print(f" - Raw win rates are reliable (adjustment: {adjustment:+.3f})")
            if abs(adjustment) > 0.05:
                print(" - Note: Large numerical difference due to random variation, not systematic bias")

        print("\n**Statistical Details**:")
        print(" - Analysis method: Multiple logistic regression")
        print(" - Controls: Role bias + First-mover advantage")
        print(f" - Sample size: {len(df)} independent games")
        print(" - Each game counted once (no double-counting)")

        # Final interpretation based on bias detection
        print("\n🎯 **INTERPRETATION**:")
        if any_bias_detected:
            print(" The bias-adjusted results are MORE RELIABLE than raw win rates")
            print(" because significant bias was detected and corrected.")
        else:
            print(" The bias-adjusted and raw results should be similar")
            print(" because no significant bias was detected.")
            print(" Either result can be trusted for model comparison.")
    else:
        print("❌ **Could not determine model performance** (insufficient data or regression failed)")

    # Export data. Only a trailing '.out' is stripped: str.replace() would
    # also rewrite '.out' inside directory names and, for a path without
    # '.out', return the input path itself so the CSV would overwrite the log.
    suffix = '.out'
    base_path = file_path[:-len(suffix)] if file_path.endswith(suffix) else file_path
    output_csv = base_path + '_corrected_analysis.csv'
    df.to_csv(output_csv, index=False)
    print(f"\n✅ Corrected data exported to '{output_csv}'")
# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()