In [5]:
import pandas as pd
football = pd.read_csv("football.csv")
football.head()
Out[5]:
| team_code | wins | losses | Points_For | yards/play | TO | pass td | rush td | 1st Downs | penalties | number drives | avg start | PtsA | opp yards/play | opp 1st downs | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | crd2005 | 5 | 11 | 311 | 5.2 | 37 | 21 | 2 | 304 | 145 | 194 | Own 29.5 | 387 | 5.1 | 272 |
| 1 | jax2019 | 6 | 10 | 300 | 5.4 | 20 | 24 | 3 | 298 | 132 | 177 | Own 25.9 | 397 | 6.0 | 340 |
| 2 | chi1999 | 6 | 10 | 272 | 4.9 | 37 | 25 | 4 | 302 | 117 | 205 | Own 32.5 | 341 | 5.4 | 310 |
| 3 | mia2017 | 6 | 10 | 281 | 4.9 | 29 | 24 | 4 | 284 | 137 | 190 | Own 28.1 | 393 | 5.4 | 319 |
| 4 | ram2009 | 1 | 15 | 175 | 4.5 | 33 | 12 | 4 | 259 | 100 | 181 | Own 28.1 | 436 | 5.9 | 333 |
In [6]:
#Cleaning up data
#Separate team and team code, also make the average start a single numeric field
#Get meaningful column names
football['team']=football['team_code'].str.slice(0,3)
football['year']=football['team_code'].str.slice(3,8)
football['avg start']=football['avg start'].str.slice(-4,-1)
football['avg_start'] = football['avg start'].astype(float)
football2 = football.rename(columns={
'yards/play': 'yards_play',
'opp yards/play': 'yards_play_opp',
'Points_For' : 'points',
'PtsA': 'points_opp',
#'avg start': 'avgStartString',
'pass td': 'pass_TD',
'rush td': 'rush_TD',
'1st Downs' : 'first_downs',
'opp 1st downs' : 'first_downs_opp',
'number drives' : 'drives'
})
# Move 'colC' to the front
teamcol = football2.pop('team')
yearcol=football2.pop('year')
football2.insert(0, 'year', yearcol)
football2.insert(0, 'team', teamcol)
football_cleaned = football2.drop(['team_code', 'avg start'], axis=1)
football_cleaned.head()
Out[6]:
| team | year | wins | losses | points | yards_play | TO | pass_TD | rush_TD | first_downs | penalties | drives | points_opp | yards_play_opp | first_downs_opp | avg_start | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | crd | 2005 | 5 | 11 | 311 | 5.2 | 37 | 21 | 2 | 304 | 145 | 194 | 387 | 5.1 | 272 | 29.0 |
| 1 | jax | 2019 | 6 | 10 | 300 | 5.4 | 20 | 24 | 3 | 298 | 132 | 177 | 397 | 6.0 | 340 | 25.0 |
| 2 | chi | 1999 | 6 | 10 | 272 | 4.9 | 37 | 25 | 4 | 302 | 117 | 205 | 341 | 5.4 | 310 | 32.0 |
| 3 | mia | 2017 | 6 | 10 | 281 | 4.9 | 29 | 24 | 4 | 284 | 137 | 190 | 393 | 5.4 | 319 | 28.0 |
| 4 | ram | 2009 | 1 | 15 | 175 | 4.5 | 33 | 12 | 4 | 259 | 100 | 181 | 436 | 5.9 | 333 | 28.0 |
In [7]:
#Which teams have the best wins differential 2000-2019?
football_avgpoints=football_cleaned.groupby('team').agg({'wins':'mean', 'losses':'mean'})
football_avgpoints['wins_diff']=(football_avgpoints['wins']-football_avgpoints['losses'])
football_avgpoints.sort_values('wins',ascending=False)
Out[7]:
| wins | losses | wins_diff | |
|---|---|---|---|
| team | |||
| nwe | 11.666667 | 4.333333 | 7.333333 |
| pit | 10.047619 | 5.857143 | 4.190476 |
| clt | 10.000000 | 6.000000 | 4.000000 |
| gnb | 9.761905 | 6.142857 | 3.619048 |
| rav | 9.428571 | 6.571429 | 2.857143 |
| phi | 9.285714 | 6.666667 | 2.619048 |
| sea | 9.095238 | 6.857143 | 2.238095 |
| den | 8.952381 | 7.047619 | 1.904762 |
| nor | 8.857143 | 7.142857 | 1.714286 |
| dal | 8.380952 | 7.619048 | 0.761905 |
| kan | 8.333333 | 7.666667 | 0.666667 |
| min | 8.333333 | 7.571429 | 0.761905 |
| oti | 8.238095 | 7.761905 | 0.476190 |
| sdg | 8.095238 | 7.904762 | 0.190476 |
| atl | 8.047619 | 7.904762 | 0.142857 |
| car | 7.857143 | 8.095238 | -0.238095 |
| nyg | 7.857143 | 8.142857 | -0.285714 |
| chi | 7.761905 | 8.238095 | -0.476190 |
| mia | 7.523810 | 8.476190 | -0.952381 |
| ram | 7.476190 | 8.476190 | -1.000000 |
| nyj | 7.428571 | 8.571429 | -1.142857 |
| htx | 7.277778 | 8.722222 | -1.444444 |
| sfo | 7.238095 | 8.714286 | -1.476190 |
| cin | 7.095238 | 8.761905 | -1.666667 |
| tam | 7.095238 | 8.904762 | -1.809524 |
| buf | 7.047619 | 8.952381 | -1.904762 |
| crd | 6.809524 | 9.095238 | -2.285714 |
| was | 6.761905 | 9.190476 | -2.428571 |
| jax | 6.714286 | 9.285714 | -2.571429 |
| rai | 6.333333 | 9.666667 | -3.333333 |
| det | 5.809524 | 10.142857 | -4.333333 |
| cle | 4.809524 | 11.142857 | -6.333333 |
In [8]:
#Let's see who has the best point and yards differential, and how are the two related/
avg_points = (
football_cleaned
.groupby("team")
.agg(
avg_points=("points","mean"),
avg_yards_play=("yards_play","mean"),
)
.sort_values("avg_points", ascending=False)
)
print(avg_points)
football_cleaned[["yards_play", "points"]].corr()
avg_points avg_yards_play team nwe 429.619048 5.471429 nor 407.666667 5.728571 gnb 401.619048 5.595238 clt 398.904762 5.571429 phi 381.619048 5.442857 sdg 379.428571 5.495238 kan 373.380952 5.504762 den 373.000000 5.471429 pit 366.571429 5.419048 sea 365.380952 5.314286 min 364.571429 5.471429 atl 358.142857 5.390476 rav 357.190476 5.000000 dal 356.523810 5.490476 ram 352.714286 5.395238 nyg 349.666667 5.366667 oti 344.095238 5.280952 car 342.428571 5.190476 sfo 334.285714 5.219048 htx 333.777778 5.188889 det 329.000000 5.100000 cin 329.000000 5.085714 tam 326.047619 5.176190 rai 324.857143 5.214286 chi 321.380952 4.914286 mia 318.142857 5.047619 crd 318.095238 5.004762 was 318.047619 5.257143 buf 318.000000 5.038095 jax 317.142857 5.100000 nyj 316.952381 5.023810 cle 274.000000 4.866667
Out[8]:
| yards_play | points | |
|---|---|---|
| yards_play | 1.000000 | 0.805235 |
| points | 0.805235 | 1.000000 |
In [19]:
#Let's see a graph!!!!
import matplotlib.pyplot as plt
import numpy as np
football_cleaned.plot.scatter(x="yards_play", y="points", alpha=0.5)
plt.title("Points vs. Yards per Play")
plt.xlabel("Yards per Play")
plt.ylabel("Points")
x = football_cleaned["yards_play"]
y = football_cleaned["points"]
m, b = np.polyfit(x, y, 1) # slope, intercept
plt.scatter(x, y, alpha=0.5)
plt.plot(x, m*x + b, color="red") # best-fit line
plt.title("Points vs. Yards per Play (with trendline)")
plt.xlabel("Yards per Play")
plt.ylabel("Points")
plt.show()
In [99]:
"""
Next Steps
With more time, I could extend the analysis by:
Examining defenses (fewest points allowed),
Comparing turnovers (TO) to wins,
Exploring year-over-year scoring trends,
Building charts for the top 10 offenses/defenses.
"""
Out[99]:
'\nNext Steps\nWith more time, I could extend the analysis by:\nExamining defenses (fewest points allowed),\nComparing turnovers (TO) to wins,\nExploring year-over-year scoring trends,\nBuilding charts for the top 10 offenses/defenses.\n'
In [57]:
#Hey how about we see a trend of how scoring has increased (or decreased) over the years?
avg_points_by_season=(
football_cleaned.groupby("year")["points"].mean().sort_index()
)
print(avg_points_by_season.round(1))
print("\n\nLooks like we got a bit of a trend. Can we model this with a linear trend line?")
plt.figure(figsize=(10, 6))
# Convert years to integers if they aren't already
years = np.array(avg_points_by_season.index).astype(int)
points = avg_points_by_season.values
# Plot the original data
plt.plot(years, points, 'b-', marker='o', label='Average Points')
# Calculate and plot the trend line
m, b = np.polyfit(years, points, 1)
plt.plot(years, m*years + b, 'g-', label='Trend')
# Format the x-axis to show years properly
plt.xlabel('Year')
plt.ylabel('Average Points')
plt.title('Average Points by Season with Trend')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.xticks(years[::2], rotation=45)
plt.text(years.min(), points.max(), f"y = {m:.2f}x + {b:.1f}", color="green")
plt.show()
print("On average every year the average total points for all teams increases by " + str(round(m,1)) + " points.")
year 1999 333.0 2000 330.8 2001 323.4 2002 346.8 2003 333.3 2004 343.8 2005 329.9 2006 330.5 2007 347.0 2008 352.5 2009 343.5 2010 352.6 2011 354.9 2012 364.1 2013 374.5 2014 361.4 2015 365.0 2016 364.4 2017 347.5 2018 373.5 2019 365.0 Name: points, dtype: float64 Looks like we got a bit of a trend. Can we model this with a linear trend line?
On average every year the average total points for all teams increases by 2.1 points.