#import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime as dt
# Load the dataframe of previously scraped box scores,
# or initialize an empty one when no saved file is available yet.


def date_parser(d):
    """Parse 'YYYY-MM-DD' date text into pandas datetime values."""
    # NOTE(review): not referenced by the loading code below; kept in case
    # other cells still call it.
    return pd.to_datetime(d, format='%Y-%m-%d')


try:
    all_games = pd.read_csv(
        'all_games_box_score.csv',
        index_col=0,
        parse_dates=['Date'],
        date_format='%Y-%m-%d',
    )
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only "no saved data yet" falls back to an empty frame. Any other
    # failure (e.g. a corrupt file) is raised instead of being silently
    # replaced by an empty frame that would later overwrite the CSV.
    all_games = pd.DataFrame()
else:
    # transform Date column to only contain the date (no time component)
    all_games['Date'] = all_games.Date.dt.date
all_games
# The tableDataText function is from Stack Overflow user imbr.
# link: https://stackoverflow.com/questions/2935658/beautifulsoup-get-the-contents-of-a-specific-table
#Define functions needed to parse and create dataframe
def tableDataText(table):
    """Extract the cell texts of an HTML table as a list of rows.

    Parameters:
        table: a parsed ``<table>`` element exposing ``find_all`` (e.g. a
            BeautifulSoup tag) whose rows are ``<tr>`` elements.

    Returns:
        A list of lists of strings, one inner list per ``<tr>``. If the
        first row contains ``<th>`` cells, their texts form the first inner
        list (the header); every other row contributes the texts of its
        ``<td>`` cells. A table without any ``<tr>`` yields an empty list.
    """
    def rowgetDataText(tr, coltag='td'):  # td (data) or th (header)
        return [cell.get_text(strip=False) for cell in tr.find_all(coltag)]

    trs = table.find_all('tr')
    if not trs:
        # nothing to parse; avoids an IndexError on trs[0] below
        return []

    rows = []
    headerow = rowgetDataText(trs[0], 'th')
    if headerow:  # if there is a header row, include it first
        rows.append(headerow)
        trs = trs[1:]
    # every remaining table row is treated as a data row
    rows.extend(rowgetDataText(tr, 'td') for tr in trs)
    return rows
def change_minutes(minutes):
    """Convert playing time from an 'mm:ss' string to decimal minutes.

    Parameters:
        minutes: playing time as text in 'mm:ss' form.

    Returns:
        The time in minutes rounded to two decimals, or None when the value
        cannot be parsed (e.g. it is not a string, is not numeric, or has no
        ':' separator), which is how players who did not play are handled.
    """
    try:
        parts = [float(t) for t in minutes.split(':')]
        return np.round(parts[0] + (parts[1] / 60), 2)
    # for players who did not play, we need to return a null value;
    # only parsing failures are caught so interrupts still propagate
    except (AttributeError, TypeError, ValueError, IndexError):
        return None
def is_win(Team, Opponent):
    """Tell whether Team outscored Opponent.

    Both arguments are dataframes with a ``Pts`` column; the column totals
    are compared.
    Returns 1 when Team's total is strictly greater, otherwise 0.
    """
    team_points = Team.Pts.sum()
    opponent_points = Opponent.Pts.sum()
    return 1 if team_points > opponent_points else 0
# (source column, makes column, attempts column, percentage column)
# NOTE: the mixed 'pt'/'Pt' capitalisation is part of the saved CSV schema.
_SHOT_COLUMNS = (
    ('2Pts', '2ptM', '2PtA', '2Pt%'),
    ('3Pts', '3ptM', '3PtA', '3Pt%'),
    ('FG', 'FGM', 'FGA', 'FG%'),
    ('FT', 'FTM', 'FTA', 'FT%'),
)


def _split_shot_column(df, source, made, attempted, pct):
    """Split one combined shooting column into makes, attempts, percentage.

    The text in ``source`` is split once on '/' and the right-hand part is
    split once more on ' '. ``source`` is then dropped from ``df`` in place.
    """
    df[[made, attempted]] = df[source].str.split('/',
                                                 n=1,
                                                 expand=True,
                                                 regex=True)
    df[[attempted, pct]] = df[attempted].str.split(' ',
                                                   n=1,
                                                   expand=True,
                                                   regex=True)
    df.drop(source, axis=1, inplace=True)


def create_df(team, i):
    """Create a per-player box-score dataframe from one team's table.

    Parameters:
        team: the parsed ``<table>`` element of one team, handed to
            ``tableDataText``.
        i: index of this team's name in the module-level list ``teams``.

    Returns:
        A dataframe with one row per table row (the last two rows are
        discarded), numeric statistics converted to float, 'Min' converted
        to decimal minutes and a 'Team' column holding the team name.

    Note:
        Reads the global ``teams``, which must be set before calling.
    """
    # get name of team from the global list of team names
    team_name = teams[i]
    # use tableDataText to extract table from team soup
    team_list = tableDataText(team)
    # create dataframe from team_list (first entry is the header),
    # replace newlines with spaces and strip '%' signs,
    # and discard the last two rows (totals and coaches)
    df = pd.DataFrame(team_list[1:], columns=team_list[0])
    df = df.replace({'\n': ' ', '%': ''}, regex=True)[:-2]
    # remove whitespace from player names
    df['Players'] = df.Players.str.strip()
    # split up the 2Pts, 3Pts, FG and FT columns into
    # makes, attempts, and percentage
    for source, made, attempted, pct in _SHOT_COLUMNS:
        _split_shot_column(df, source, made, attempted, pct)
    # convert min from mm:ss string format to float
    df['Min'] = df['Min'].apply(change_minutes)
    # convert numeric columns from string to float
    float_cols = list(df.columns.drop(['#', 'Players', 'Min']))
    df = df.replace('None', None)
    df[float_cols] = df[float_cols].apply(pd.to_numeric,
                                          errors='coerce').astype('float')
    # include team name to dataframe
    df['Team'] = team_name
    return df
# Define a dictionary that maps each game date (a datetime.date) to the list
# of box-score URLs for the games played on that date.
# We include all games for this year in case someone wants to try out the code.
# Each date below lists eight URLs.
url_dict = {dt.date(2023, 8, 25) : ['https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/31e73809-4c71-4a56-bb9b-dc261d23b136',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/1e128916-459f-4930-866b-543f92495685',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/89cf255f-080b-4a22-b43e-3ff293b68037',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/a0c9effd-2407-416d-ad49-259c8ac29e5d',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/0f5d5e8c-603f-4ede-a8ab-e433f0be7167',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/a6861ad5-55f1-4e51-a624-0f54092ebdcb',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/cdd3ad9f-0d27-455d-8a25-3951d26181f4',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/840cae94-aca7-40b4-935e-8fc4e82f807f'],
dt.date(2023, 8, 26) : ['https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/39bb540e-ee1e-450b-be27-cf936e9299b2',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/a3fad431-af47-4e21-84b5-f15783d76a22',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/9c9dfa09-a5af-4be9-9058-b473b5bd33dc',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/c87e8b52-75ea-40d3-9b51-87058c833957',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/598bb5b0-6e68-41fc-910e-77b4238fcc97',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/20751354-ba34-48d2-81f6-4be3fb02fba8',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/cde478ad-3717-45fa-a264-52bf4dcae611',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/0a891362-a31a-44a2-a06a-1dfe0579825c'],
dt.date(2023, 8, 27) : ['https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/ec23be52-3379-441d-ae38-228b4d48188a',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/30ba1588-ee1c-4436-8433-9cb13dbf30b3',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/39e49762-096f-45d3-b5e0-0dd04b3c9838',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/47f06d36-62cc-4965-8c58-7673bf30844b',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/5f2fe4c6-46c0-4af5-98d1-ad9fbe6942bb',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/596abe23-7993-46c5-b1ad-e85f15d7974f',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/71956bce-97be-47dc-bfe8-d9adb45e66b9',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/7dbb777a-8853-4c6d-ac16-27327b7882e8'],
dt.date(2023, 8, 28) : ['https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/568e3c92-4549-4922-bf54-c77dd6290b8d',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/6194ea1f-02b9-4699-9d0f-017fa3c81b00',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/a6d46b94-7f63-4e05-9a7f-c9a031c7e2bc',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/0d342700-4b2c-40f7-b692-42256693a10c',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/dd1dcd25-80b6-411b-b3f7-64866e17295e',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/e812d5c1-8e07-4021-bca4-526a32a19bf2',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/93021800-7cd0-406a-a07b-053380116566',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/7effc70f-9348-49a6-b92b-f23b36f751ec'],
dt.date(2023, 8, 29) : ['https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/062985ae-bb35-4981-ade1-e51f2675ae51',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/59fc340b-82fd-4869-8c58-79b9b39f9581',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/37fbf32a-e08f-4026-b72b-6a7b9f3f1dbe',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/cbecedb1-de1b-490b-ac43-f4c57a22f84c',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/c089f75a-ab6a-4d0e-8a72-707e78121d37',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/919b5db8-fc9f-4416-9a94-f1498c42c0f6',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/1e04f909-3956-4184-989a-e60fc3a039da',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/cbb618a7-1c67-4ea5-9b5a-053c71eeaa36'],
dt.date(2023, 8, 30) : ['https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/fb3554af-0f3a-4ad4-84a8-810db71c7330',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/ea102bf1-2235-4e04-93c5-236064376244',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/fea657b9-1d62-4cb5-8b03-0935da45aee8',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/ad6a9ea3-c9cb-4f62-9d41-69a791689431',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/9b0e0f40-ba54-4376-8212-11d8a74eba89',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/e8d24996-9432-49bb-833c-34e93c4fb11e',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/0f896808-2ec8-45f2-9b4c-da81300aebf9',
'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/30ff5f55-1e86-4de7-ac3a-c67ea63e45e4']}
# Create the dataframe that contains all box scores: scrape every URL in
# url_dict, build one frame per team and game, merge them into all_games,
# then save the result to CSV.
new_frames = []
# loop through the dictionary url_dict
for date, url_list in url_dict.items():
    # loop through the list of games during the date
    for url in url_list:
        # scrape data from url; the timeout keeps a stalled connection from
        # hanging the run, and HTTP errors are raised instead of being parsed
        page = requests.get(url, timeout=30)
        page.raise_for_status()
        response = page.text
        # initialize BeautifulSoup to scrape data
        soup = BeautifulSoup(response, 'html.parser')
        # team names come from the "team-name" spans
        # NOTE: `teams` must stay a module-level name, create_df reads it
        teams = [team.get_text() for team in soup.find_all('span', {'class': 'team-name'})]
        # the first two <section> elements hold the two box-score tables
        sections = soup.find_all('section')
        team_A = sections[0].find('table')
        team_B = sections[1].find('table')
        # create dataframes for both teams
        A = create_df(team_A, 0)
        B = create_df(team_B, 1)
        # create 'is_win' column
        A['is_win'] = is_win(A, B)
        B['is_win'] = is_win(B, A)
        # create 'Date' column
        A['Date'] = date
        B['Date'] = date
        # find starters and label them; the attribute filter must be a dict
        # (the original set literal {'class', '...'} was a typo), and a set
        # of names gives constant-time membership tests
        starters = {
            starter.find('a').get_text().strip()
            for starter in soup.find_all('tr', {'class': 'x--player-is-starter'})
        }
        A['is_starter'] = A.Players.apply(lambda x: 1 if x in starters else 0)
        B['is_starter'] = B.Players.apply(lambda x: 1 if x in starters else 0)
        # extend dataframes to include opponent data
        # NOTE(review): axis=1 concat pairs rows by index position, so each
        # player row is joined with the opponent row of the same index, not
        # with team totals - confirm this is intended
        match_A = pd.concat([A, B.add_prefix('Opp_')],
                            axis=1).drop(['Opp_#'], axis=1)
        match_B = pd.concat([B, A.add_prefix('Opp_')],
                            axis=1).drop(['Opp_#'], axis=1)
        new_frames.extend([match_A, match_B])

# concatenate the main dataframe with all new frames in one pass; existing
# rows come first, so drop_duplicates keeps them over re-scraped copies
if new_frames:
    all_games = (
        pd.concat([all_games, *new_frames])
        .drop_duplicates(subset=['Players', 'Team', 'Opp_Team', 'Date'])
        .drop(['Opp_Date'], axis=1)
    )
# save dataframe to csv
all_games.to_csv('all_games_box_score.csv')
# show sample of resulting dataframe
all_games.sample(10).transpose()
10 | 9 | 8 | 5 | 5 | 3 | 3 | 10 | 7 | 8 | |
---|---|---|---|---|---|---|---|---|---|---|
# | 42 | 41 | 21 | 9 | 11 | 5 | 13 | 30 | 20 | 15 |
Players | Andreas Obst | Arnaldo Toro Barea | Justus Hollatz | Giorgi Shermadini | Nemanja Radovic | Amr El Gendy | Olivier Nkamhoua | Petar Popovic | Donatas Motiejunas | Kevin Coronel |
Min | 25.57 | 1.42 | 1.77 | 12.18 | 12.4 | 22.02 | 21.92 | 23.32 | 17.08 | NaN |
Pts | 2.0 | 0.0 | 0.0 | 4.0 | 7.0 | 6.0 | 7.0 | 8.0 | 8.0 | NaN |
OREB | 0.0 | 1.0 | 0.0 | 2.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | NaN |
DREB | 2.0 | 0.0 | 0.0 | 4.0 | 2.0 | 5.0 | 5.0 | 0.0 | 3.0 | NaN |
REB | 2.0 | 1.0 | 0.0 | 6.0 | 3.0 | 6.0 | 6.0 | 0.0 | 3.0 | NaN |
AST | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 5.0 | 3.0 | 2.0 | 1.0 | NaN |
PF | 1.0 | 0.0 | 0.0 | 0.0 | 3.0 | 1.0 | 1.0 | 1.0 | 3.0 | NaN |
TO | 2.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 3.0 | 2.0 | 0.0 | NaN |
STL | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 1.0 | 1.0 | NaN |
BLK | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | NaN |
+/- | 2.0 | -2.0 | -2.0 | -16.0 | -8.0 | 12.0 | -6.0 | -8.0 | 19.0 | NaN |
EFF | -4.0 | 0.0 | -1.0 | 7.0 | 5.0 | 11.0 | 13.0 | 7.0 | 11.0 | NaN |
2ptM | 0.0 | 0.0 | 0.0 | 1.0 | 2.0 | 2.0 | 2.0 | 3.0 | 2.0 | NaN |
2PtA | 1.0 | 1.0 | 1.0 | 5.0 | 4.0 | 3.0 | 3.0 | 4.0 | 4.0 | NaN |
2Pt% | 0.0 | 0.0 | 0.0 | 20.0 | 50.0 | 66.7 | 66.7 | 75.0 | 50.0 | NaN |
3ptM | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | NaN |
3PtA | 4.0 | 0.0 | 0.0 | 0.0 | 3.0 | 3.0 | 2.0 | 1.0 | 1.0 | NaN |
3Pt% | 0.0 | NaN | NaN | NaN | 33.3 | 0.0 | 50.0 | 0.0 | 100.0 | NaN |
FGM | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 2.0 | 3.0 | 3.0 | 3.0 | NaN |
FGA | 5.0 | 1.0 | 1.0 | 5.0 | 7.0 | 6.0 | 5.0 | 5.0 | 5.0 | NaN |
FG% | 0.0 | 0.0 | 0.0 | 20.0 | 42.9 | 33.3 | 60.0 | 60.0 | 60.0 | NaN |
FTM | 2.0 | 0.0 | 0.0 | 2.0 | 0.0 | 2.0 | 0.0 | 2.0 | 1.0 | NaN |
FTA | 3.0 | 0.0 | 0.0 | 2.0 | 0.0 | 3.0 | 0.0 | 2.0 | 2.0 | NaN |
FT% | 66.7 | NaN | NaN | 100.0 | NaN | 66.7 | NaN | 100.0 | 50.0 | NaN |
Team | Germany | Puerto Rico | Germany | Georgia | Montenegro | Egypt | Finland | Montenegro | Lithuania | Cape Verde |
is_win | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
Date | 2023-08-27 | 2023-08-26 | 2023-08-25 | 2023-08-28 | 2023-08-25 | 2023-08-29 | 2023-08-27 | 2023-08-29 | 2023-08-29 | 2023-08-30 |
is_starter | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 |
Opp_Players | Nick Kay | Koch Bar | Shuta Hara | Gregor Hrovat | Pako Cruz | Jorge Gutiérrez | Yuta Watanabe | Tomas Dimsa | Marko Simonovic | Bine Prepelic |
Opp_Min | 25.58 | 11.88 | 17.28 | 12.92 | 35.28 | 4.23 | 29.55 | 22.1 | 15.17 | 27.93 |
Opp_Pts | 5.0 | 0.0 | 0.0 | 6.0 | 16.0 | 0.0 | 4.0 | 9.0 | 13.0 | 11.0 |
Opp_OREB | 2.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 3.0 | 0.0 |
Opp_DREB | 1.0 | 2.0 | 0.0 | 1.0 | 2.0 | 1.0 | 0.0 | 0.0 | 3.0 | 4.0 |
Opp_REB | 3.0 | 2.0 | 1.0 | 2.0 | 3.0 | 1.0 | 0.0 | 0.0 | 6.0 | 4.0 |
Opp_AST | 3.0 | 1.0 | 0.0 | 0.0 | 6.0 | 0.0 | 2.0 | 1.0 | 0.0 | 0.0 |
Opp_PF | 3.0 | 1.0 | 0.0 | 3.0 | 2.0 | 1.0 | 1.0 | 2.0 | 1.0 | 1.0 |
Opp_TO | 2.0 | 0.0 | 0.0 | 0.0 | 5.0 | 1.0 | 2.0 | 0.0 | 1.0 | 0.0 |
Opp_STL | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
Opp_BLK | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 | 0.0 |
Opp_+/- | -12.0 | 8.0 | -6.0 | 17.0 | -17.0 | -15.0 | -2.0 | 18.0 | -6.0 | 18.0 |
Opp_EFF | 6.0 | 4.0 | -1.0 | 8.0 | 12.0 | -1.0 | -1.0 | 5.0 | 14.0 | 12.0 |
Opp_2ptM | 1.0 | 0.0 | 0.0 | 0.0 | 6.0 | 0.0 | 1.0 | 2.0 | 3.0 | 3.0 |
Opp_2PtA | 3.0 | 0.0 | 0.0 | 0.0 | 10.0 | 0.0 | 3.0 | 2.0 | 5.0 | 4.0 |
Opp_2Pt% | 33.3 | NaN | NaN | NaN | 60.0 | NaN | 33.3 | 100.0 | 60.0 | 75.0 |
Opp_3ptM | 1.0 | 0.0 | 0.0 | 2.0 | 1.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 |
Opp_3PtA | 3.0 | 0.0 | 2.0 | 3.0 | 4.0 | 1.0 | 3.0 | 5.0 | 5.0 | 2.0 |
Opp_3Pt% | 33.3 | NaN | 0.0 | 66.7 | 25.0 | 0.0 | 0.0 | 0.0 | 40.0 | 0.0 |
Opp_FGM | 2.0 | 0.0 | 0.0 | 2.0 | 7.0 | 0.0 | 1.0 | 2.0 | 5.0 | 3.0 |
Opp_FGA | 6.0 | 0.0 | 2.0 | 3.0 | 14.0 | 1.0 | 6.0 | 7.0 | 10.0 | 6.0 |
Opp_FG% | 33.3 | NaN | 0.0 | 66.7 | 50.0 | 0.0 | 16.7 | 28.6 | 50.0 | 50.0 |
Opp_FTM | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 2.0 | 5.0 | 1.0 | 5.0 |
Opp_FTA | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 4.0 | 5.0 | 1.0 | 6.0 |
Opp_FT% | NaN | NaN | NaN | NaN | 50.0 | NaN | 50.0 | 100.0 | 100.0 | 83.3 |
Opp_Team | Australia | South Sudan | Japan | Slovenia | Mexico | Mexico | Japan | Lithuania | Montenegro | Slovenia |
Opp_is_win | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 |
Opp_is_starter | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 1 |