import pandas as pd
football = pd.read_csv("football.csv")
football.head()

#Cleaning up data
#Separate team and team code, also make the average start a single numeric field
#Get meaningful column names

football['team']=football['team_code'].str.slice(0,3)
football['year']=football['team_code'].str.slice(3,8)
football['avg start']=football['avg start'].str.slice(-4,-1)
football['avg_start'] = football['avg start'].astype(float)

football2 = football.rename(columns={
    'yards/play': 'yards_play',
    'opp yards/play': 'yards_play_opp',
    'Points_For' : 'points',
    'PtsA': 'points_opp',
    #'avg start': 'avgStartString',
    'pass td': 'pass_TD',
    'rush td': 'rush_TD',
    '1st Downs' : 'first_downs',
    'opp 1st downs' : 'first_downs_opp',
    'number drives' : 'drives'
})


# Move 'colC' to the front
teamcol = football2.pop('team')
yearcol=football2.pop('year')

football2.insert(0, 'year', yearcol)
football2.insert(0, 'team', teamcol)

football_cleaned = football2.drop(['team_code', 'avg start'], axis=1)
football_cleaned.head()

#Which teams have the best wins differential 2000-2019?
football_avgpoints=football_cleaned.groupby('team').agg({'wins':'mean', 'losses':'mean'})
football_avgpoints['wins_diff']=(football_avgpoints['wins']-football_avgpoints['losses'])
football_avgpoints.sort_values('wins',ascending=False)

#Let's see who has the best point and yards differential, and how are the two related/

avg_points = (
    football_cleaned
    .groupby("team")
    .agg(
        avg_points=("points","mean"),
        avg_yards_play=("yards_play","mean"),
    )
    .sort_values("avg_points", ascending=False)
)

print(avg_points)
football_cleaned[["yards_play", "points"]].corr()

      avg_points  avg_yards_play
team                            
nwe   429.619048        5.471429
nor   407.666667        5.728571
gnb   401.619048        5.595238
clt   398.904762        5.571429
phi   381.619048        5.442857
sdg   379.428571        5.495238
kan   373.380952        5.504762
den   373.000000        5.471429
pit   366.571429        5.419048
sea   365.380952        5.314286
min   364.571429        5.471429
atl   358.142857        5.390476
rav   357.190476        5.000000
dal   356.523810        5.490476
ram   352.714286        5.395238
nyg   349.666667        5.366667
oti   344.095238        5.280952
car   342.428571        5.190476
sfo   334.285714        5.219048
htx   333.777778        5.188889
det   329.000000        5.100000
cin   329.000000        5.085714
tam   326.047619        5.176190
rai   324.857143        5.214286
chi   321.380952        4.914286
mia   318.142857        5.047619
crd   318.095238        5.004762
was   318.047619        5.257143
buf   318.000000        5.038095
jax   317.142857        5.100000
nyj   316.952381        5.023810
cle   274.000000        4.866667

#Let's see a graph!!!!

import matplotlib.pyplot as plt
import numpy as np


football_cleaned.plot.scatter(x="yards_play", y="points", alpha=0.5)

plt.title("Points vs. Yards per Play")
plt.xlabel("Yards per Play")
plt.ylabel("Points")

x = football_cleaned["yards_play"]
y = football_cleaned["points"]

m, b = np.polyfit(x, y, 1)  # slope, intercept

plt.scatter(x, y, alpha=0.5)
plt.plot(x, m*x + b, color="red")  # best-fit line

plt.title("Points vs. Yards per Play (with trendline)")
plt.xlabel("Yards per Play")
plt.ylabel("Points")
plt.show()

"""
Next Steps
With more time, I could extend the analysis by:
Examining defenses (fewest points allowed),
Comparing turnovers (TO) to wins,
Exploring year-over-year scoring trends,
Building charts for the top 10 offenses/defenses.
"""

'\nNext Steps\nWith more time, I could extend the analysis by:\nExamining defenses (fewest points allowed),\nComparing turnovers (TO) to wins,\nExploring year-over-year scoring trends,\nBuilding charts for the top 10 offenses/defenses.\n'

#Hey how about we see a trend of how scoring has increased (or decreased) over the years?

avg_points_by_season=(
    football_cleaned.groupby("year")["points"].mean().sort_index()
)
print(avg_points_by_season.round(1))
print("\n\nLooks like we got a bit of a trend. Can we model this with a linear trend line?")

plt.figure(figsize=(10, 6))

# Convert years to integers if they aren't already
years = np.array(avg_points_by_season.index).astype(int)
points = avg_points_by_season.values

# Plot the original data
plt.plot(years, points, 'b-', marker='o', label='Average Points')

# Calculate and plot the trend line
m, b = np.polyfit(years, points, 1)
plt.plot(years, m*years + b, 'g-', label='Trend')

# Format the x-axis to show years properly

plt.xlabel('Year')
plt.ylabel('Average Points')
plt.title('Average Points by Season with Trend')
plt.legend()
plt.grid(True)
plt.tight_layout()

plt.xticks(years[::2], rotation=45)
plt.text(years.min(), points.max(), f"y = {m:.2f}x + {b:.1f}", color="green")
plt.show()

print("On average every year the average total points for all teams increases by " + str(round(m,1)) + " points.")

year
1999    333.0
2000    330.8
2001    323.4
2002    346.8
2003    333.3
2004    343.8
2005    329.9
2006    330.5
2007    347.0
2008    352.5
2009    343.5
2010    352.6
2011    354.9
2012    364.1
2013    374.5
2014    361.4
2015    365.0
2016    364.4
2017    347.5
2018    373.5
2019    365.0
Name: points, dtype: float64


Looks like we got a bit of a trend. Can we model this with a linear trend line?

On average every year the average total points for all teams increases by 2.1 points.

	wins	losses	wins_diff
team
nwe	11.666667	4.333333	7.333333
pit	10.047619	5.857143	4.190476
clt	10.000000	6.000000	4.000000
gnb	9.761905	6.142857	3.619048
rav	9.428571	6.571429	2.857143
phi	9.285714	6.666667	2.619048
sea	9.095238	6.857143	2.238095
den	8.952381	7.047619	1.904762
nor	8.857143	7.142857	1.714286
dal	8.380952	7.619048	0.761905
kan	8.333333	7.666667	0.666667
min	8.333333	7.571429	0.761905
oti	8.238095	7.761905	0.476190
sdg	8.095238	7.904762	0.190476
atl	8.047619	7.904762	0.142857
car	7.857143	8.095238	-0.238095
nyg	7.857143	8.142857	-0.285714
chi	7.761905	8.238095	-0.476190
mia	7.523810	8.476190	-0.952381
ram	7.476190	8.476190	-1.000000
nyj	7.428571	8.571429	-1.142857
htx	7.277778	8.722222	-1.444444
sfo	7.238095	8.714286	-1.476190
cin	7.095238	8.761905	-1.666667
tam	7.095238	8.904762	-1.809524
buf	7.047619	8.952381	-1.904762
crd	6.809524	9.095238	-2.285714
was	6.761905	9.190476	-2.428571
jax	6.714286	9.285714	-2.571429
rai	6.333333	9.666667	-3.333333
det	5.809524	10.142857	-4.333333
cle	4.809524	11.142857	-6.333333

Python Data Science Project

The following is a portfolio project from CodeCademy Python for Data Science. It consists of opening NFL statistics from a csv file and then performing transformations and data analysis.

	team_code	wins	losses	Points_For	yards/play	TO	pass td	rush td	1st Downs	penalties	number drives	avg start	PtsA	opp yards/play	opp 1st downs
0	crd2005	5	11	311	5.2	37	21	2	304	145	194	Own 29.5	387	5.1	272
1	jax2019	6	10	300	5.4	20	24	3	298	132	177	Own 25.9	397	6.0	340
2	chi1999	6	10	272	4.9	37	25	4	302	117	205	Own 32.5	341	5.4	310
3	mia2017	6	10	281	4.9	29	24	4	284	137	190	Own 28.1	393	5.4	319
4	ram2009	1	15	175	4.5	33	12	4	259	100	181	Own 28.1	436	5.9	333

	team	year	wins	losses	points	yards_play	TO	pass_TD	rush_TD	first_downs	penalties	drives	points_opp	yards_play_opp	first_downs_opp	avg_start
0	crd	2005	5	11	311	5.2	37	21	2	304	145	194	387	5.1	272	29.0
1	jax	2019	6	10	300	5.4	20	24	3	298	132	177	397	6.0	340	25.0
2	chi	1999	6	10	272	4.9	37	25	4	302	117	205	341	5.4	310	32.0
3	mia	2017	6	10	281	4.9	29	24	4	284	137	190	393	5.4	319	28.0
4	ram	2009	1	15	175	4.5	33	12	4	259	100	181	436	5.9	333	28.0

	yards_play	points
yards_play	1.000000	0.805235
points	0.805235	1.000000