# Select the WebAgg backend before matplotlib.pyplot is imported.
import matplotlib
matplotlib.use('WebAgg')

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Not referenced directly in this script.
# NOTE(review): presumably kept because the WebAgg backend serves figures
# through tornado -- confirm before removing.
import tornado
# Load the match line-up export. Replace the path below with the actual file
# path on your machine.
# The path is a raw string on purpose: in a normal string literal the "\U" of
# "C:\Users" starts a unicode escape and the script fails to compile.
data = pd.read_csv(
    r"C:\Users\S\Documents\Coding\Whack\Coventry city football\CCFC_match_lineups_data.csv",
    sep=",",
)

# Discard rows that contain any missing value, then renumber the rows
# (reset_index keeps the previous labels in a new "index" column).
data.dropna(inplace=True)
data.reset_index(inplace=True)

# Placeholder column; it is populated further down from the "lineup" text.
data["minutes_played"] = np.nan

print(data)
def _lineup_minutes(lineup_text):
    """Return the summed minutes found in one serialised line-up string.

    The text is split on commas. Every fourth piece, plus the final piece,
    is expected to hold a number starting at character 19; one trailing
    character is stripped from the intermediate pieces and two from the
    final piece.

    NOTE(review): these offsets presumably match entries shaped like
    " 'minutes_played': 90.0}" with a closing "}]" on the last entry --
    confirm against the CSV. A comma inside any field would break the
    every-fourth-piece assumption.

    Raises:
        ValueError: if a selected piece does not contain a parsable number.
    """
    pieces = lineup_text.split(",")
    last = len(pieces) - 1
    total = 0
    for position, piece in enumerate(pieces):
        if position == last:
            total += float(piece[19:-2])
        elif (position + 1) % 4 == 0:
            total += float(piece[19:-1])
    return total


# Fill the whole column in one assignment. The earlier per-row
# `data.minutes_played[i] = ...` was chained assignment, which pandas warns
# about and which does not write through to `data` under Copy-on-Write.
data["minutes_played"] = data["lineup"].map(_lineup_minutes).astype(float)
# Columns that the rest of the script does not read.
unused_columns = [
    "Unnamed: 0",
    "Opposition",
    "match_id",
    "season_name",
    "team",
    "opposition_team",
    "possession",
    "final_third_possession",
    "ppda",
    "match_outcome",
    "monte_carlo_win_prob",
    "monte_carlo_draw_prob",
    "monte_carlo_loss_prob",
    "location",
    "lineup",
]
data.drop(columns=unused_columns, inplace=True)

# Give every row the injury_risk value of the row that follows it. The last
# row has no successor, becomes NaN and is removed by dropna().
data["injury_risk"] = data["injury_risk"].shift(periods=-1)
data.dropna(inplace=True)
data.reset_index(inplace=True)

print(pd.to_datetime(data["date"]))
# For each year, remove the rows dated on the latest May day found that year.
# NOTE(review): presumably this is the season's final match, whose shifted
# injury_risk would otherwise come from the following season -- confirm.
match_dates = pd.to_datetime(data["date"])  # parse once, reuse below
may_dates = match_dates[match_dates.dt.month == 5]

if not may_dates.empty:
    # NOTE(review): range() stops before the latest year, so that year's last
    # May match is kept, exactly as before -- confirm this is intended.
    for year in range(may_dates.dt.year.min(), may_dates.dt.year.max()):
        may_days = may_dates.dt.day[may_dates.dt.year == year]
        if may_days.empty:
            # No May match that year; max() on an empty selection used to raise.
            continue
        # Zero-padded day yields the same "YYYY-05-DD" text that the previous
        # two-branch string concatenation produced.
        last_may_date = f"{year}-05-{int(may_days.max()):02d}"
        data.drop(data.index[data["date"] == last_may_date], inplace=True)
# Move every match date back by seven days before bucketing by week.
data["date"] = pd.to_datetime(data["date"]) - pd.Timedelta(days=7)

# Columns that are summed per week.
weekly_columns = [
    "Distance",
    "HSR",
    "Sprint",
    "Accelerations",
    "Decelerations",
    "Jumps",
    "goals_scored",
    "goals_conceded",
    "np_xg",
    "np_xg_conceded",
    "shots",
    "shots_on_target",
    "opposition_shots",
    "opposition_shots_on_target",
    "passes",
    "opposition_passes",
    "completed_passes_into_the_box",
    "pressures",
    "pressure_regains",
    "fouls",
    "tackles",
    "yellow_cards",
    "xg_within_8_seconds_of_corner",
    "xg_conceded_within_8_seconds_of_corner",
    "shots_within_8_seconds_of_corner",
    "shots_conceded_within_8_seconds_of_corner",
    "goals_within_8_seconds_of_corner",
    "goals_conceded_within_8_seconds_of_corner",
    "xg_within_8_seconds_of_indirect_free_kick",
    "xg_conceded_within_8_seconds_of_indirect_free_kick",
    "shots_within_8_seconds_of_indirect_free_kick",
    "shots_conceded_within_8_seconds_of_indirect_free_kick",
    "goals_within_8_seconds_of_indirect_free_kick",
    "goals_conceded_within_8_seconds_of_indirect_free_kick",
    "completed_passes_and_carries_into_final_third",
    "minutes_played",
    "injury_risk",
]

by_week = data.groupby([pd.Grouper(key="date", freq="W")])

# Number of rows with a Distance value in each week.
counts = by_week[["Distance"]].count()
# Weekly totals of every selected column; `data` is now indexed by week.
data = by_week[weekly_columns].sum()
pd.set_option("display.max_rows", None)

# Turn the weekly injury_risk sum into a per-row mean. Weeks with no rows get
# the sentinel -1 so they can be discarded below.
# This replaces an element-by-element loop that used chained assignment
# (data["injury_risk"][i] = ...) and integer positions on date-labelled
# Series; pandas deprecates both and Copy-on-Write does not write through.
rows_per_week = counts["Distance"]
data["injury_risk"] = (data["injury_risk"] / rows_per_week).where(
    rows_per_week != 0, -1
)

# Each week takes the value computed for the week after it; the final week
# has no successor, becomes NaN and is removed.
data["injury_risk"] = data["injury_risk"].shift(-1)
data.dropna(inplace=True)
data.reset_index(inplace=True)

# Discard weeks with zero total distance and weeks carrying the -1 sentinel.
data = data.drop(data[data["Distance"] == 0].index)
data = data.drop(data[data["injury_risk"] == -1].index)

# The week label is not used as a feature.
data = data.drop("date", axis=1)

print(data)
print(counts)
print(data)
from sklearn.model_selection import train_test_split

# Separate the value to predict from the feature columns.
target = data["injury_risk"]
data = data.drop(columns="injury_risk")
print(data)

# Min-max scale every feature column using the minimum and range of the
# complete data set.
column_min = data.min()
column_range = data.max() - column_min
data = (data - column_min) / column_range

# split into test and train data
train, test = train_test_split(data, test_size=0.3)

# Rescale both parts with the minimum and range of the training part only.
train_min = train.min()
train_range = train.max() - train_min
n_train = (train - train_min) / train_range
n_test = (test - train_min) / train_range

print(n_train)
print(n_test)
from sklearn.linear_model import LinearRegression

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3)  # ,random_state = 2, stratify = target

print(X_train)
print(X_test)
print(y_train)
print(y_test)

# Fit a linear regression on the training rows.
# A LogisticRegression fit used to run just before this. It was removed because
# its fitted model was overwritten here before anything read it.
# NOTE(review): that fit was also likely to raise, since LogisticRegression is
# a classifier and the target is a per-week average -- confirm the target is
# not meant to be a class label.
classifier = LinearRegression()
classifier.fit(X_train, y_train)

Y_pred = classifier.predict(X_test)
print(Y_pred)
# Evaluate the predictions.
# Y_pred is produced by a LinearRegression model, so it holds continuous
# values. confusion_matrix only accepts class labels and raises ValueError on
# continuous predictions, so regression error metrics are reported instead.
from sklearn.metrics import mean_squared_error, r2_score

print("Mean squared error:", mean_squared_error(y_test, Y_pred))
print("R^2 score:", r2_score(y_test, Y_pred))
# Select technical metrics for correlation analysis
float_columns = n_train.select_dtypes(include=["float64"])
technical_metrics = list(float_columns.columns)

# Correlation heatmap
correlations = n_train[technical_metrics].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlations, annot=False, cmap="coolwarm")
plt.title("Correlation Heatmap of Technical Metrics")
plt.show()

pd.set_option("display.max_rows", None)
# Log in or sign up for Devpost to join the conversation.