This project collects and analyzes top NCAA Division III outdoor track and field performances from 2011–2025. It scrapes official qualifying lists from TFRRS, cleans and converts performance marks, applies wind adjustments to sprint events, calculates yearly averages, and visualizes trends across events.
The data reveals a significant improvement in nearly every event in 2022, soon after the end of the COVID-19 pandemic and the NCAA's adoption of new Name, Image, and Likeness (NIL) policies. The pandemic allowed many athletes who missed a year of competition to receive a "COVID year," an additional year of eligibility, while NIL gives athletes the opportunity to be sponsored and receive compensation.
Although 2024 was technically the final year of competition for athletes using an additional "COVID year," performance still improved in nearly all events. I argue that this continued improvement is likely due to NIL policies. Since athletes in NCAA Division III cannot receive athletic scholarships, top athletes previously had little to no financial incentive to continue competing; now, however, they have the opportunity to monetize their continued athletic careers.
Additional research would be necessary to further support this claim, including research on NIL deals in NCAA Division III, a comparison with the improvements seen in Divisions I and II (where a financial incentive, scholarships, was already present), and an investigation of high school times to determine whether an overall improvement in track and field is occurring beyond the collegiate level.
All visualizations can be found in the event_plots folder.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from convert_time import wind_conversion as w_conv
import matplotlib.pyplot as plt
import os
from dict_events import dict_events_function as events_dict
year_urls = {
2025: "https://tfrrs.org/lists/5020",
2024: "https://www.tfrrs.org/lists/4517/2024_NCAA_Division_III_Outdoor_Qualifying_FINAL",
2023: "https://m.tfrrs.org/lists/4043/2023_NCAA_Division_III_Outdoor_Qualifying_FINAL",
2022: "https://tfrrs.org/lists/3714/2022_NCAA_Division_III_Outdoor_Qualifying_FINAL",
2021: "https://soap.tfrrs.org/lists/3195/2021_NCAA_Division_III_Outdoor_Qualifying_FINAL",
2020: "https://tfrrs.org/lists/2907/2020_NCAA_Div_III_Outdoor_Qualifying",
2019: "https://tfrrs.org/lists/2572/2019_NCAA_Div_III_Outdoor_Qualifying_FINAL",
2018: "https://tfrrs.org/lists/2283/2018_NCAA_Div_III_Outdoor_Qualifying_FINAL",
2017: "https://www.tfrrs.org/lists/1914/2017_NCAA_Div_III_Outdoor_Qualifying_FINAL",
2016: "https://mobile.tfrrs.org/lists/1684/2016_NCAA_Division_III_Outdoor_Qualifying_FINAL",
2015: "https://m.tfrrs.org/lists/1443/2015_NCAA_Division_III_Outdoor_Qualifying_FINAL",
2014: "https://www.tfrrs.org/lists/1232/2014_NCAA_Division_III_Outdoor_Qualifying_FINAL",
2013: "https://tfrrs.org/lists/1033/2013_NCAA_Division_III_Outdoor_Qualifying_FINAL",
2012: "https://tfrrs.org/lists/842/2012_NCAA_Div_III_Outdoor_Qualifier_List",
2011: "https://tfrrs.org/lists/675/2011_NCAA_Division_III_Outdoor_POP_List_FINAL"
}
HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
}
def time_to_float(time_str):
    """Convert a 'M:SS.ss' time string to total seconds."""
    minutes, rest = time_str.split(":")
    minutes = int(minutes)
    seconds = float(rest)
    total_seconds = minutes * 60 + seconds
    return total_seconds

def float_to_time(seconds_float):
    """Convert seconds back to a zero-padded 'MM:SS.ss' string."""
    minutes = int(seconds_float // 60)
    seconds = seconds_float % 60
    return f"{minutes:02d}:{seconds:05.2f}"
def fetch_html(url, year):
    """Fetch page or load from local cache if available"""
    folder = "html_cache"
    os.makedirs(folder, exist_ok=True)
    path = os.path.join(folder, f"{year}.html")
    if os.path.exists(path):
        with open(path, "r") as f:
            html = f.read()
    else:
        response = requests.get(url, headers=HEADERS)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch {year}")
        html = response.text
        with open(path, "w") as f:
            f.write(html)
        time.sleep(random.uniform(2, 5))
    return html
def parse_top_50_times(html, event_class, event):
    """Parse HTML and return top 50 legal marks with wind conversion"""
    soup = BeautifulSoup(html, "html.parser")
    times = []
    event_block = soup.find("div", class_=event_class)
    if not event_block:
        print(f"⚠️ Could not find event block: {event_class}")
        return times
    rows = event_block.select(".performance-list-body > div")
    needs_conversion = ("Men's 100m", "Women's 100m", "Men's 200m", "Women's 200m")
    for row in rows:
        try:
            columns = row.select("div")
            # Relay events ("x" in the name) list the mark in a different column
            if "x" in event:
                time_str = columns[2].text.strip()
            else:
                time_str = columns[4].text.strip()
            if ":" in time_str:
                mark = time_to_float(time_str)   # times of a minute or longer
            elif "m" in time_str:
                mark = float(time_str[:-1])      # field marks such as "15.23m"
            else:
                mark = float(time_str)           # short sprint times
            # Apply wind conversion to wind-affected sprints
            if event in needs_conversion:
                wind_str = columns[7].text.strip() if len(columns) > 7 else "0"
                wind_val = float(wind_str.replace("+", "").replace("w", "")) if wind_str else 0.0
                mark = w_conv(mark, wind_val)
            times.append(mark)
            if len(times) >= 50:
                break
        except (ValueError, IndexError):
            continue
    return times
yearly_averages = []
all_times = {}
events = events_dict()
dont_convert = ("Men's Decathlon", "Women's Heptathlon")
for year, url in year_urls.items():
    print(f"Processing {year}...")
    try:
        html = fetch_html(url, year)
        for event, class_id in events.items():
            times = parse_top_50_times(html, class_id, event)
            all_times[(year, event)] = times  # keep raw marks keyed by year and event
            if times:
                avg_time = sum(times) / len(times)
                yearly_averages.append({"year": year, "event": event, "avg_time": avg_time})
                # Human-readable average for the console; the CSV keeps the numeric value
                if avg_time > 60 and event not in dont_convert:
                    display = float_to_time(avg_time)
                else:
                    display = round(avg_time, 2)
                print(f"  {event}: {display}")
            else:
                print(f"No marks found for {event}")
    except Exception as e:
        print(f"Error processing {year}: {e}")
df = pd.DataFrame(yearly_averages).sort_values("year")
df.to_csv("d3_averages_2011-2025.csv", index=False)
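# The resulting CSV has three columns: year, event, and avg_time, where avg_time
# is in seconds for track events, metres for field events, and points for the
# multi-events. The plotting script below reads this file back in.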
import pandas as pd
import matplotlib.pyplot as plt
import os
df = pd.read_csv("d3_averages_2011-2025.csv")
# Create output folder for plots
output_folder = "event_plots"
os.makedirs(output_folder, exist_ok=True)
# Group events by type: timed events (lower is better), multi-events (points), and field events (metres)
multi=("Men's Decathlon", "Women's Heptathlon")
sprints=("Men's 100m","Women's 100m","Men's 200m","Women's 200m","Men's 400m","Women's 400m", "Men's 110m Hurdles", "Women's 100m Hurdles", "Men's 400m Hurdles","Women's 400m Hurdles","Women's 4x100m Relay","Men's 4x100m Relay")
jumps=("Men's High Jump", "Women's High Jump","Men's Long Jump","Women's Long Jump","Men's Triple Jump","Women's Triple Jump","Men's Pole Vault","Women's Pole Vault")
mid_distance=("Men's 800m","Women's 800m", "Men's 4x400m Relay", "Women's 4x400m Relay","Men's 1500m", "Women's 1500m","Men's 3000m Steeplechase","Women's 3000m Steeplechase")
distance=("Men's 5000m","Women's 5000m","Men's 10000m", "Women's 10000m")
# Loop through each unique event
for event in df['event'].unique():
    # Select data for this event
    event_data = df[df['event'] == event].sort_values('year')
    # Plot
    plt.figure(figsize=(8, 5))
    plt.plot(event_data['year'], event_data['avg_time'], marker='o', linestyle='-')
    plt.title(f"Progression of {event} From 2011 to 2025")
    plt.xlabel("Year")
    if any(event in t for t in (sprints, mid_distance, distance)):
        # Invert the y-axis for timed events so improvement trends upward
        plt.gca().invert_yaxis()
        plt.ylabel("Average Time (s)")
    elif event in multi:
        plt.ylabel("Average Score (pts)")
    else:
        plt.ylabel("Average Mark (m)")
    plt.grid(True)
    plt.tight_layout()
    # Save figure
    filename = f"{output_folder}/{event.replace(' ', '_')}.png"
    plt.savefig(filename)
    plt.close()
print(f"Saved plots for {len(df['event'].unique())} events in '{output_folder}' folder.")
def dict_events_function():
    """Map event names to the CSS class of each event's block on the TFRRS list pages."""
    li_events_class = []
    to_skip = (8, 10, 14, 15, 16, 17, 18, 20, 32, 34, 35, 36, 37, 38)
    men_only = (5, 39)
    women_only = (4, 40)
    for i in range(4, 41):
        if i not in to_skip and i not in men_only and i not in women_only:
            li_events_class.append(f"row gender_m standard_event_hnd_{i}")
            li_events_class.append(f"row gender_f standard_event_hnd_{i}")
        elif i in women_only:
            li_events_class.append(f"row gender_f standard_event_hnd_{i}")
        elif i in men_only:
            li_events_class.append(f"row gender_m standard_event_hnd_{i}")
    # Event names in the same order as the class strings generated above
    event_names = [
        "Women's 100m Hurdles", "Men's 110m Hurdles",
        "Men's 100m", "Women's 100m",
        "Men's 200m", "Women's 200m",
        "Men's 400m Hurdles", "Women's 400m Hurdles",
        "Men's 400m", "Women's 400m",
        "Men's 800m", "Women's 800m",
        "Men's 1500m", "Women's 1500m",
        "Men's 3000m Steeplechase", "Women's 3000m Steeplechase",
        "Men's 5000m", "Women's 5000m",
        "Men's 10000m", "Women's 10000m",
        "Men's High Jump", "Women's High Jump",
        "Men's Pole Vault", "Women's Pole Vault",
        "Men's Long Jump", "Women's Long Jump",
        "Men's Triple Jump", "Women's Triple Jump",
        "Men's Discus", "Women's Discus",
        "Men's Hammer", "Women's Hammer",
        "Men's Javelin", "Women's Javelin",
        "Men's Shot Put", "Women's Shot Put",
        "Men's 4x100m Relay", "Women's 4x100m Relay",
        "Men's 4x400m Relay", "Women's 4x400m Relay",
        "Men's Decathlon", "Women's Heptathlon",
    ]
    dict_events = dict(zip(event_names, li_events_class))
    return dict_events
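The convert_time module imported by the scraper (wind_conversion, aliased as w_conv) is not shown above. As a rough sketch of the interface the scraper expects, the placeholder below assumes a simple linear model of roughly 0.05 seconds of effect per 1 m/s of wind over 100m, a common rule of thumb; both the coefficient and the linear form are assumptions, and the actual conversion used in the project may differ.

def wind_conversion(mark, wind, seconds_per_mps=0.05):
    """Normalize a sprint time to still-air conditions (illustrative sketch only).

    Assumes a linear wind effect: a tailwind (positive reading) made the mark
    faster than still air, so time is added back; a headwind (negative reading)
    slowed it, so time is subtracted. The 0.05 s per m/s coefficient is an
    assumption, not necessarily the value used by this project.
    """
    return mark + wind * seconds_per_mps

Under this placeholder, a 10.60 100m run with a +2.0 m/s tailwind would be normalized to 10.70, while the same time into a 1.0 m/s headwind would become 10.55.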