In [1]:
#import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime as dt
In [2]:
#load dataframe consisting of boxscores 
#or initialize if not available

try:
    all_games = pd.read_csv('all_games_box_score.csv',
                            index_col = 0,
                            parse_dates = ['Date'],
                            date_format = '%Y-%m-%d')

    #keep plain datetime.date objects instead of timestamps so that the
    #cached rows carry the same kind of 'Date' value as freshly scraped ones
    all_games['Date'] = all_games['Date'].dt.date
#no usable cache yet: start from an empty dataframe
#(only "file missing / file empty" is handled here on purpose; any other
#error, e.g. a malformed cache file, should be seen rather than silently
#discarding the cached data)
except (FileNotFoundError, pd.errors.EmptyDataError):
    all_games = pd.DataFrame()

all_games
Out[2]:

The `tableDataText` function below comes from an answer by Stack Overflow user imbr.

Link: https://stackoverflow.com/questions/2935658/beautifulsoup-get-the-contents-of-a-specific-table

In [3]:
#Define functions needed to parse and create dataframe
def tableDataText(table):    
    """Parses a html segment started with tag <table> followed 
    by multiple <tr> (table rows) and inner <td> (table data) tags. 

    If the first row contains <th> cells it is treated as the header row
    and returned first; every other row is read from its <td> cells.

    Parameters
    ----------
    table : object exposing find_all(tag), whose rows expose find_all(tag)
        and whose cells expose get_text(strip=...), e.g. a BeautifulSoup
        <table> element.

    Returns
    -------
    list of list of str
        One inner list per table row; an empty list if the table has no rows.
    """
    def rowgetDataText(tr, coltag='td'): # td (data) or th (header)       
        return [cell.get_text(strip=False) for cell in tr.find_all(coltag)]  

    trs = table.find_all('tr')
    # a table without any <tr> has nothing to parse (and no trs[0] to inspect)
    if not trs:
        return []

    rows = []
    headerow = rowgetDataText(trs[0], 'th')
    if headerow: # if there is a header row include first
        rows.append(headerow)
        trs = trs[1:]
    # every remaining row is a data row
    rows.extend(rowgetDataText(tr, 'td') for tr in trs)
    return rows

def change_minutes(minutes):
    """ Converts playing time from a 'mm:ss' string to decimal minutes.

    Parameters
    ----------
    minutes : str
        Playing time such as '25:34'.

    Returns
    -------
    float or None
        Minutes plus seconds / 60, rounded to 2 decimals, or None when the
        value cannot be parsed (not a string, no ':' separator, or
        non-numeric parts).
    """
    try:
        parts = [float(t) for t in minutes.split(':')]
        return np.round(parts[0] + (parts[1] / 60), 2)
    #for players who did not play, we need to return a null value
    #(non-string -> AttributeError/TypeError, non-numeric -> ValueError,
    #missing ':' -> IndexError)
    except (AttributeError, TypeError, ValueError, IndexError):
        return None
    
def is_win(Team, Opponent):
    """Tells whether Team outscored Opponent.

    Both arguments are dataframes with a 'Pts' column; the column totals
    are compared.
    Returns 1 if Team's total is strictly larger, otherwise 0 (a tie
    therefore counts as 0).
    """
    team_points = Team.Pts.sum()
    opponent_points = Opponent.Pts.sum()
    return int(team_points > opponent_points)

def _split_shooting_column(df, source, made, attempted, pct):
    """Expands one combined shooting column into three separate columns.

    The source cells are split at the first '/' and then at the first
    space, i.e. they are presumably of the form 'made/attempted pct'.
    The new string columns are appended in the order made, attempted, pct
    and the source column is removed.
    Returns the resulting dataframe.
    """
    #single-character separators are treated literally by str.split
    df[[made, attempted]] = df[source].str.split('/', n = 1, expand = True)
    df[[attempted, pct]] = df[attempted].str.split(' ', n = 1, expand = True)
    return df.drop(columns = source)


def create_df(team, i, team_names = None):
    """Creates a dataframe after loading website data.

    Parameters
    ----------
    team : table element handed to tableDataText (one team's boxscore table).
    i : int
        Position of this team's name in team_names.
    team_names : sequence of str, optional
        Names of the teams of the current game. When omitted, the
        notebook-level variable `teams` is used.

    Returns a dataframe with one row per player, the shooting columns
    split into makes / attempts / percentage, numeric columns as float
    and a 'Team' column.
    """

    #fall back to the notebook-level list when no names are passed in
    if team_names is None:
        team_names = teams

    #get name of team
    team_name = team_names[i]

    #use tableDataText to extract table from team soup
    team_list = tableDataText(team)

    #create dataframe from team_list
    #discard rows with totals and coaches (the last two rows)
    df = (pd.DataFrame(team_list[1:], columns = team_list[0])
            .replace({'\n' : ' ' , '%' : ''}, regex = True)
            .iloc[:-2]
            .copy())

    #remove whitespace from player names
    df['Players'] = df['Players'].str.strip()

    #split up 2Pt column into makes, attempts, and percentage
    #repeat process for 3pt, FG, FT
    #(source column, makes, attempts, percentage)
    shooting_columns = (('2Pts', '2ptM', '2PtA', '2Pt%'),
                        ('3Pts', '3ptM', '3PtA', '3Pt%'),
                        ('FG', 'FGM', 'FGA', 'FG%'),
                        ('FT', 'FTM', 'FTA', 'FT%'))
    for source, made, attempted, pct in shooting_columns:
        df = _split_shooting_column(df, source, made, attempted, pct)

    #convert min from mm:ss string format to float
    df['Min'] = df['Min'].apply(change_minutes)

    #Convert numeric columns from string to float
    #anything that is not a number becomes NaN
    float_cols = list(df.columns.drop(['#', 'Players', 'Min']))
    df = df.replace('None', None)
    df[float_cols] = df[float_cols].apply(pd.to_numeric, 
                                          errors = 'coerce').astype('float')

    #include team name to dataframe
    df['Team'] = team_name

    return df
In [4]:
#define dictionary with game dates and url of boxscores
#all games for this year are included in case
#someone wants to try out the code

#every boxscore url shares this prefix, only the trailing game id differs
_BOXSCORE_URL_PREFIX = 'https://www.fiba.basketball/en/Module/494c069c-5738-48c6-9c94-115693b87914/'

#game ids grouped by day of August 2023
_GAME_IDS_BY_DAY = {
    25 : ['31e73809-4c71-4a56-bb9b-dc261d23b136',
          '1e128916-459f-4930-866b-543f92495685',
          '89cf255f-080b-4a22-b43e-3ff293b68037',
          'a0c9effd-2407-416d-ad49-259c8ac29e5d',
          '0f5d5e8c-603f-4ede-a8ab-e433f0be7167',
          'a6861ad5-55f1-4e51-a624-0f54092ebdcb',
          'cdd3ad9f-0d27-455d-8a25-3951d26181f4',
          '840cae94-aca7-40b4-935e-8fc4e82f807f'],

    26 : ['39bb540e-ee1e-450b-be27-cf936e9299b2',
          'a3fad431-af47-4e21-84b5-f15783d76a22',
          '9c9dfa09-a5af-4be9-9058-b473b5bd33dc',
          'c87e8b52-75ea-40d3-9b51-87058c833957',
          '598bb5b0-6e68-41fc-910e-77b4238fcc97',
          '20751354-ba34-48d2-81f6-4be3fb02fba8',
          'cde478ad-3717-45fa-a264-52bf4dcae611',
          '0a891362-a31a-44a2-a06a-1dfe0579825c'],

    27 : ['ec23be52-3379-441d-ae38-228b4d48188a',
          '30ba1588-ee1c-4436-8433-9cb13dbf30b3',
          '39e49762-096f-45d3-b5e0-0dd04b3c9838',
          '47f06d36-62cc-4965-8c58-7673bf30844b',
          '5f2fe4c6-46c0-4af5-98d1-ad9fbe6942bb',
          '596abe23-7993-46c5-b1ad-e85f15d7974f',
          '71956bce-97be-47dc-bfe8-d9adb45e66b9',
          '7dbb777a-8853-4c6d-ac16-27327b7882e8'],

    28 : ['568e3c92-4549-4922-bf54-c77dd6290b8d',
          '6194ea1f-02b9-4699-9d0f-017fa3c81b00',
          'a6d46b94-7f63-4e05-9a7f-c9a031c7e2bc',
          '0d342700-4b2c-40f7-b692-42256693a10c',
          'dd1dcd25-80b6-411b-b3f7-64866e17295e',
          'e812d5c1-8e07-4021-bca4-526a32a19bf2',
          '93021800-7cd0-406a-a07b-053380116566',
          '7effc70f-9348-49a6-b92b-f23b36f751ec'],

    29 : ['062985ae-bb35-4981-ade1-e51f2675ae51',
          '59fc340b-82fd-4869-8c58-79b9b39f9581',
          '37fbf32a-e08f-4026-b72b-6a7b9f3f1dbe',
          'cbecedb1-de1b-490b-ac43-f4c57a22f84c',
          'c089f75a-ab6a-4d0e-8a72-707e78121d37',
          '919b5db8-fc9f-4416-9a94-f1498c42c0f6',
          '1e04f909-3956-4184-989a-e60fc3a039da',
          'cbb618a7-1c67-4ea5-9b5a-053c71eeaa36'],

    30 : ['fb3554af-0f3a-4ad4-84a8-810db71c7330',
          'ea102bf1-2235-4e04-93c5-236064376244',
          'fea657b9-1d62-4cb5-8b03-0935da45aee8',
          'ad6a9ea3-c9cb-4f62-9d41-69a791689431',
          '9b0e0f40-ba54-4376-8212-11d8a74eba89',
          'e8d24996-9432-49bb-833c-34e93c4fb11e',
          '0f896808-2ec8-45f2-9b4c-da81300aebf9',
          '30ff5f55-1e86-4de7-ac3a-c67ea63e45e4'],
}

#game date -> list of full boxscore urls played on that date
url_dict = {dt.date(2023, 8, day) : [_BOXSCORE_URL_PREFIX + game_id for game_id in game_ids]
            for day, game_ids in _GAME_IDS_BY_DAY.items()}
In [5]:
#create the dataframe that contains all boxscores

#seconds to wait for the server before giving up on a request
REQUEST_TIMEOUT = 30

#frames of every scraped game; concatenated with all_games once at the end
#instead of re-building all_games inside the loop
game_frames = []

#initialize loop that will loop through the dictionary url_dict
for date, url_list in url_dict.items():
    #loop through the list of games during the date
    for url in url_list:
        #scrape data from url; fail loudly on timeouts and HTTP errors
        #instead of parsing an error page
        response = requests.get(url, timeout = REQUEST_TIMEOUT)
        response.raise_for_status()
        #initialize BeautifulSoup to scrape data
        soup = BeautifulSoup(response.text, 'html.parser')
        #box scores are separated by "team-name" class, separate the teams
        #(create_df reads the notebook-level name `teams`, so keep this name)
        teams = [team.get_text() for team in soup.find_all('span', {'class' : 'team-name'})]
        sections = soup.find_all('section')
        team_A = sections[0].find('table')
        team_B = sections[1].find('table')

        #create dataframes for both teams
        A = create_df(team_A, 0)
        B = create_df(team_B, 1)

        #create 'is_win' column
        A['is_win'] = is_win(A, B)
        B['is_win'] = is_win(B, A)

        #create 'Date' column
        A['Date'] = date
        B['Date'] = date

        #find starters and label
        #attrs must be a dict {'class' : name}; starters of both teams
        #end up in the same list
        starters = [starter.find('a').get_text().strip()
                    for starter in soup.find_all('tr', {'class' : 'x--player-is-starter'})]
        A['is_starter'] = A.Players.isin(starters).astype('int64')
        B['is_starter'] = B.Players.isin(starters).astype('int64')

        #extend dataframes to include opponent data
        #(rows of the two teams are paired by their index)
        match_A = pd.concat([A, B.add_prefix('Opp_')],
                            axis = 1).drop(['Opp_#'], axis = 1)

        match_B = pd.concat([B, A.add_prefix('Opp_')],
                            axis = 1).drop(['Opp_#'], axis = 1)

        game_frames.extend([match_A, match_B])

#concatenate main dataframe with dataframes of all scraped games
#previously cached rows come first, so they win over re-scraped duplicates
if game_frames:
    all_games = (pd.concat([all_games, *game_frames])
                   .drop_duplicates(subset = ['Players', 'Team', 'Opp_Team', 'Date'])
                   .drop(['Opp_Date'], axis = 1))
In [6]:
#write the combined boxscores to disk (same file the load cell reads)
all_games.to_csv(path_or_buf = 'all_games_box_score.csv')
In [7]:
#preview ten randomly chosen rows, transposed so that every
#dataframe column is shown as one output row
all_games.sample(n = 10).T
Out[7]:
10 9 8 5 5 3 3 10 7 8
# 42 41 21 9 11 5 13 30 20 15
Players Andreas Obst Arnaldo Toro Barea Justus Hollatz Giorgi Shermadini Nemanja Radovic Amr El Gendy Olivier Nkamhoua Petar Popovic Donatas Motiejunas Kevin Coronel
Min 25.57 1.42 1.77 12.18 12.4 22.02 21.92 23.32 17.08 NaN
Pts 2.0 0.0 0.0 4.0 7.0 6.0 7.0 8.0 8.0 NaN
OREB 0.0 1.0 0.0 2.0 1.0 1.0 1.0 0.0 0.0 NaN
DREB 2.0 0.0 0.0 4.0 2.0 5.0 5.0 0.0 3.0 NaN
REB 2.0 1.0 0.0 6.0 3.0 6.0 6.0 0.0 3.0 NaN
AST 0.0 0.0 0.0 1.0 0.0 5.0 3.0 2.0 1.0 NaN
PF 1.0 0.0 0.0 0.0 3.0 1.0 1.0 1.0 3.0 NaN
TO 2.0 0.0 0.0 0.0 1.0 1.0 3.0 2.0 0.0 NaN
STL 0.0 0.0 0.0 0.0 0.0 0.0 2.0 1.0 1.0 NaN
BLK 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 NaN
+/- 2.0 -2.0 -2.0 -16.0 -8.0 12.0 -6.0 -8.0 19.0 NaN
EFF -4.0 0.0 -1.0 7.0 5.0 11.0 13.0 7.0 11.0 NaN
2ptM 0.0 0.0 0.0 1.0 2.0 2.0 2.0 3.0 2.0 NaN
2PtA 1.0 1.0 1.0 5.0 4.0 3.0 3.0 4.0 4.0 NaN
2Pt% 0.0 0.0 0.0 20.0 50.0 66.7 66.7 75.0 50.0 NaN
3ptM 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 NaN
3PtA 4.0 0.0 0.0 0.0 3.0 3.0 2.0 1.0 1.0 NaN
3Pt% 0.0 NaN NaN NaN 33.3 0.0 50.0 0.0 100.0 NaN
FGM 0.0 0.0 0.0 1.0 3.0 2.0 3.0 3.0 3.0 NaN
FGA 5.0 1.0 1.0 5.0 7.0 6.0 5.0 5.0 5.0 NaN
FG% 0.0 0.0 0.0 20.0 42.9 33.3 60.0 60.0 60.0 NaN
FTM 2.0 0.0 0.0 2.0 0.0 2.0 0.0 2.0 1.0 NaN
FTA 3.0 0.0 0.0 2.0 0.0 3.0 0.0 2.0 2.0 NaN
FT% 66.7 NaN NaN 100.0 NaN 66.7 NaN 100.0 50.0 NaN
Team Germany Puerto Rico Germany Georgia Montenegro Egypt Finland Montenegro Lithuania Cape Verde
is_win 1 1 1 0 1 1 0 0 1 0
Date 2023-08-27 2023-08-26 2023-08-25 2023-08-28 2023-08-25 2023-08-29 2023-08-27 2023-08-29 2023-08-29 2023-08-30
is_starter 1 0 0 1 0 0 1 1 0 0
Opp_Players Nick Kay Koch Bar Shuta Hara Gregor Hrovat Pako Cruz Jorge Gutiérrez Yuta Watanabe Tomas Dimsa Marko Simonovic Bine Prepelic
Opp_Min 25.58 11.88 17.28 12.92 35.28 4.23 29.55 22.1 15.17 27.93
Opp_Pts 5.0 0.0 0.0 6.0 16.0 0.0 4.0 9.0 13.0 11.0
Opp_OREB 2.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0 3.0 0.0
Opp_DREB 1.0 2.0 0.0 1.0 2.0 1.0 0.0 0.0 3.0 4.0
Opp_REB 3.0 2.0 1.0 2.0 3.0 1.0 0.0 0.0 6.0 4.0
Opp_AST 3.0 1.0 0.0 0.0 6.0 0.0 2.0 1.0 0.0 0.0
Opp_PF 3.0 1.0 0.0 3.0 2.0 1.0 1.0 2.0 1.0 1.0
Opp_TO 2.0 0.0 0.0 0.0 5.0 1.0 2.0 0.0 1.0 0.0
Opp_STL 1.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0
Opp_BLK 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 1.0 0.0
Opp_+/- -12.0 8.0 -6.0 17.0 -17.0 -15.0 -2.0 18.0 -6.0 18.0
Opp_EFF 6.0 4.0 -1.0 8.0 12.0 -1.0 -1.0 5.0 14.0 12.0
Opp_2ptM 1.0 0.0 0.0 0.0 6.0 0.0 1.0 2.0 3.0 3.0
Opp_2PtA 3.0 0.0 0.0 0.0 10.0 0.0 3.0 2.0 5.0 4.0
Opp_2Pt% 33.3 NaN NaN NaN 60.0 NaN 33.3 100.0 60.0 75.0
Opp_3ptM 1.0 0.0 0.0 2.0 1.0 0.0 0.0 0.0 2.0 0.0
Opp_3PtA 3.0 0.0 2.0 3.0 4.0 1.0 3.0 5.0 5.0 2.0
Opp_3Pt% 33.3 NaN 0.0 66.7 25.0 0.0 0.0 0.0 40.0 0.0
Opp_FGM 2.0 0.0 0.0 2.0 7.0 0.0 1.0 2.0 5.0 3.0
Opp_FGA 6.0 0.0 2.0 3.0 14.0 1.0 6.0 7.0 10.0 6.0
Opp_FG% 33.3 NaN 0.0 66.7 50.0 0.0 16.7 28.6 50.0 50.0
Opp_FTM 0.0 0.0 0.0 0.0 1.0 0.0 2.0 5.0 1.0 5.0
Opp_FTA 0.0 0.0 0.0 0.0 2.0 0.0 4.0 5.0 1.0 6.0
Opp_FT% NaN NaN NaN NaN 50.0 NaN 50.0 100.0 100.0 83.3
Opp_Team Australia South Sudan Japan Slovenia Mexico Mexico Japan Lithuania Montenegro Slovenia
Opp_is_win 0 0 0 1 0 0 1 1 0 1
Opp_is_starter 1 0 1 0 1 0 1 0 1 1