#remove warnings
import warnings
warnings.filterwarnings('ignore')
#import modules
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium

#load the tree census and rename `trunk_wire` to `trnk_wire`
#so it follows the same prefix as the `trnk_light` / `trnk_other` columns
trees = pd.read_csv('data/trees.csv').rename(columns={'trunk_wire': 'trnk_wire'})
trees.head()


def to_name(entry):
    """Return ``entry`` with its words separated by single spaces.

    The text is split on any run of whitespace and re-joined with one
    space, so leading, trailing and repeated whitespace is removed.

    Args:
        entry: A single string (one cell value, not a whole Series).

    Returns:
        The whitespace-normalised string; ``''`` for an empty or
        all-whitespace input.
    """
    # `.str` is the pandas Series accessor and does not exist on a plain
    # string, so split the value itself.
    return ' '.join(entry.split())


#preview the first rows of the trees table
trees.head()


#import neighborhoods geodata
#(read from the shapefile into `neighborhoods`, then preview its first rows)
neighborhoods = gpd.read_file('data/nta.shp')
neighborhoods.head()


#check dtypes, non-null counts and null counts of both the trees and neighborhoods tables
for df in [trees, neighborhoods]:
    #info() prints its report itself and returns None, so it is called on its
    #own instead of handing that None to display()
    df.info()
    display(df.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64229 entries, 0 to 64228
Data columns (total 20 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   tree_id     64229 non-null  int64  
 1   tree_dbh    64229 non-null  int64  
 2   curb_loc    64229 non-null  object 
 3   spc_common  62428 non-null  object 
 4   status      64229 non-null  object 
 5   health      62427 non-null  object 
 6   root_stone  64229 non-null  object 
 7   root_grate  64229 non-null  object 
 8   root_other  64229 non-null  object 
 9   trnk_wire   64229 non-null  object 
 10  trnk_light  64229 non-null  object 
 11  trnk_other  64229 non-null  object 
 12  brch_light  64229 non-null  object 
 13  brch_shoe   64229 non-null  object 
 14  brch_other  64229 non-null  object 
 15  postcode    64229 non-null  int64  
 16  nta         64229 non-null  object 
 17  nta_name    64229 non-null  object 
 18  latitude    64229 non-null  float64
 19  longitude   64229 non-null  float64
dtypes: float64(2), int64(3), object(15)
memory usage: 9.8+ MB

None

tree_id          0
tree_dbh         0
curb_loc         0
spc_common    1801
status           0
health        1802
root_stone       0
root_grate       0
root_other       0
trnk_wire        0
trnk_light       0
trnk_other       0
brch_light       0
brch_shoe        0
brch_other       0
postcode         0
nta              0
nta_name         0
latitude         0
longitude        0
dtype: int64

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   borocode    195 non-null    float64 
 1   boroname    195 non-null    object  
 2   countyfips  195 non-null    object  
 3   ntacode     195 non-null    object  
 4   ntaname     195 non-null    object  
 5   shape_area  195 non-null    float64 
 6   shape_leng  195 non-null    float64 
 7   geometry    195 non-null    geometry
dtypes: float64(3), geometry(1), object(4)
memory usage: 12.3+ KB

None

borocode      0
boroname      0
countyfips    0
ntacode       0
ntaname       0
shape_area    0
shape_leng    0
geometry      0
dtype: int64


#split the trees columns into object (text) columns and the remaining non-boolean columns
is_object = trees.dtypes == 'object'
is_bool = trees.dtypes == 'bool'
col_obj = trees.columns[is_object].tolist()
col_num = trees.columns[~is_object & ~is_bool].tolist()


#show the relative frequency of every distinct entry in each object column
for name in col_obj:
    shares = trees[name].value_counts(normalize = True)
    display(shares)

OnCurb            0.933099
OffsetFromCurb    0.066901
Name: curb_loc, dtype: float64

honeylocust         0.211059
Callery pear        0.116887
ginkgo              0.093852
pin oak             0.073429
Sophora             0.071330
                      ...   
spruce              0.000016
Osage-orange        0.000016
red pine            0.000016
Persian ironwood    0.000016
smoketree           0.000016
Name: spc_common, Length: 128, dtype: float64

Alive    0.971944
Dead     0.028056
Name: status, dtype: float64

Good    0.758614
Fair    0.183574
Poor    0.057812
Name: health, dtype: float64

No     0.804201
Yes    0.195799
Name: root_stone, dtype: float64

No     0.961357
Yes    0.038643
Name: root_grate, dtype: float64

No     0.921889
Yes    0.078111
Name: root_other, dtype: float64

No     0.985723
Yes    0.014277
Name: trnk_wire, dtype: float64

No     0.994847
Yes    0.005153
Name: trnk_light, dtype: float64

No     0.913123
Yes    0.086877
Name: trnk_other, dtype: float64


#columns holding 'Yes'/'No' flags that are turned into booleans
to_bool = [
    'root_stone', 'root_grate', 'root_other',
    'trnk_wire', 'trnk_light', 'trnk_other',
    'brch_light', 'brch_shoe', 'brch_other',
]

#a flag becomes True only where the entry is exactly 'Yes'
for flag in to_bool:
    trees[flag] = trees[flag].eq('Yes')

#derived boolean columns, note that only trees with 'Good' health are considered healthy
trees['alive'] = trees['status'].eq('Alive')
trees['healthy'] = trees['health'].eq('Good')
trees['on_curb'] = trees['curb_loc'].eq('OnCurb')

trees.sample(20)


#list the dtype of every trees column to confirm the flag columns are now bool
trees.dtypes

tree_id         int64
tree_dbh        int64
curb_loc       object
spc_common     object
status         object
health         object
root_stone       bool
root_grate       bool
root_other       bool
trnk_wire        bool
trnk_light       bool
trnk_other       bool
brch_light       bool
brch_shoe        bool
brch_other       bool
postcode        int64
nta            object
nta_name       object
latitude      float64
longitude     float64
alive            bool
healthy          bool
on_curb          bool
dtype: object


#import missingno package
import missingno as msno

#one figure with two stacked panels
fig, (ax1, ax2) = plt.subplots(nrows = 2, ncols = 1, figsize = (10,8), tight_layout = True)
fig.suptitle('Null Values Details')

#top panel: nullity matrix with the rows ordered by tree_dbh,
#to see whether the missing values line up with trunk diameter
trees_by_dbh = trees.sort_values('tree_dbh')
ax1 = msno.matrix(trees_by_dbh, ax = ax1)

#bottom panel: nullity heatmap, to see whether the missing values are correlated
ax2 = msno.heatmap(trees, ax = ax2)

#show figure
plt.show()


#percentage of rows missing in the column that has the most null values
worst_null_count = trees.isna().sum().max()
missing_pct = round(100 * (worst_null_count / trees.shape[0]), 2)
print('Percentage of missing values: ', missing_pct, '%')

Percentage of missing values:  2.81 %


#null counts per column, restricted to trees whose status is not "Alive"
not_alive = trees[trees['status'] != 'Alive']
not_alive.isna().sum()

tree_id          0
tree_dbh         0
curb_loc         0
spc_common    1801
status           0
health        1802
root_stone       0
root_grate       0
root_other       0
trnk_wire        0
trnk_light       0
trnk_other       0
brch_light       0
brch_shoe        0
brch_other       0
postcode         0
nta              0
nta_name         0
latitude         0
longitude        0
alive            0
healthy          0
on_curb          0
dtype: int64


#replace null values on health with dead and turn it into ordinal data
#initialize mapping
health_map = {'Dead' : 0, 'Poor' : 1, 'Fair' : 2, 'Good' : 3}
#the result is assigned back to the column: calling replace(inplace = True) on
#trees['health'] acts on a temporary selection and leaves `trees` untouched when
#pandas Copy-on-Write is active. astype pins the dtype to integers so the later
#means work, and fails loudly if an unexpected label is left unmapped
trees['health'] = trees['health'].fillna('Dead').replace(health_map).astype('int64')

#replace null values on species with unidentified
trees['spc_common'] = trees['spc_common'].fillna('Unidentified')


#capitalize the first letter of every whitespace-separated word in spc_common
#(hyphenated words count as one word, e.g. 'Osage-orange' is left as it is)
species_capitalize = [
    ' '.join(word.capitalize() for word in species.split()).strip()
    for species in trees['spc_common']
]
trees['spc_common'] = species_capitalize

#check dataframe
trees.head()


#check for duplicated rows
#(counts the rows flagged by DataFrame.duplicated)
trees.duplicated().sum()

0


#attach the borough name to every tree by matching its nta code to the neighborhoods table
nta_to_borough = neighborhoods[['boroname', 'ntacode']]
trees_neighborhoods = pd.merge(trees, nta_to_borough,
                               how = 'left', left_on = 'nta', right_on = 'ntacode')

#list the distinct borough names present after the merge
trees_neighborhoods['boroname'].unique()

array(['Manhattan'], dtype=object)


#count the living trees of each species, most common species first
alive_trees = trees[trees['status'] == 'Alive']
trees_grouped_by_species = (
    alive_trees.groupby('spc_common')
    .size()
    .reset_index(name = 'count')
    .sort_values('count', ascending = False)
)


#create visual style shared by the plots below
wood_color = '#340006'
style_overrides = {
    'grid.color' : wood_color,
    'axes.facecolor' : 'gray',
    'figure.facecolor' : 'lightgray',
}
sns.set_style('darkgrid', style_overrides)


#initialize figure and set its title
fig = plt.figure(figsize = (7,4))
fig.suptitle('Top 20 most common tree species in Manhattan', color = wood_color)

#horizontal barplot of the 20 species with the highest count
top_species = trees_grouped_by_species.nlargest(20, 'count')
ax = sns.barplot(data = top_species, x = 'count', y = 'spc_common', color = 'forestgreen')
ax.set(xlabel = 'Count', ylabel = '')

#show plot
plt.show()


#create table grouped by neighborhood: a tree count plus the mean of every other measure
neighborhood_aggregations = {
    'nta_name' : 'count',
    'tree_dbh' : 'mean',
    'health' : 'mean',
    'alive' : 'mean',
    'on_curb' : 'mean',
}
#every problem flag is averaged, giving the share of trees showing that problem
neighborhood_aggregations.update({flag : 'mean' for flag in to_bool})

trees_grouped_by_neighborhood = (
    trees.groupby('nta')
    .agg(neighborhood_aggregations)
    .reset_index()
    .rename(columns = {'nta_name' : 'trees_count', 'health' : 'average_health'})
)

#create table with neighborhood names and nta codes (one name per nta code)
nta_and_name = trees.groupby('nta')['nta_name'].max().to_frame()

#join table with neighborhood names
trees_grouped_by_neighborhood = nta_and_name.merge(trees_grouped_by_neighborhood, on = 'nta')


#subset neighborhoods table with only Manhattan nbhd entries
#.copy() gives an independent frame, so the columns added below are not written
#onto a slice of `neighborhoods` (which triggers SettingWithCopyWarning)
Manhattan_neighborhoods = neighborhoods[neighborhoods.ntacode.str.contains('MN')].copy()

#create nbhd centers; the centroids are computed once and reused for both coordinates
#NOTE(review): the centroid is taken in the layer's own CRS; if that is lon/lat the
#result is approximate, which should be fine for placing map markers - confirm
nbhd_centers = Manhattan_neighborhoods.geometry.centroid
Manhattan_neighborhoods['center_lng'] = nbhd_centers.x
Manhattan_neighborhoods['center_lat'] = nbhd_centers.y

#join with trees aggregated table, we now have a table with all information about each neighborhood and its aggregated tree data
#how = 'right' keeps one row per neighborhood present in the trees aggregate
Manhattan_neighborhoods_and_trees = Manhattan_neighborhoods.merge(trees_grouped_by_neighborhood,
                                                                  left_on = 'ntacode', right_on = 'nta', how = 'right')\
                                                            .drop(['nta', 'nta_name'], axis = 1)


#conversion from square meters to square kilometers
sqm_to_sqkm = 10**6

#measure areas in EPSG:32618 (WGS 84 / UTM zone 18N, units of meters).
#Web Mercator (EPSG:3857) must not be used for this: it is not an equal-area
#projection and stretches areas by about 1/cos(latitude)^2, i.e. roughly 1.7x
#at Manhattan's latitude (~40.8 N), which understates tree density by the same factor.
#NOTE: areas are therefore smaller than Web-Mercator figures; anything that
#hard-codes positions or thresholds in km^2 should be re-checked.
projected_geometry = Manhattan_neighborhoods_and_trees.geometry.to_crs(epsg = 32618)
area_sqkm = projected_geometry.area / sqm_to_sqkm

#add tree_density column to dataframe (trees per square kilometer)
Manhattan_neighborhoods_and_trees['tree_density'] = Manhattan_neighborhoods_and_trees['trees_count'] / area_sqkm
#add area column in square kilometer units
Manhattan_neighborhoods_and_trees['area'] = area_sqkm
#drop shape_area column, it is superseded by `area`
Manhattan_neighborhoods_and_trees.drop('shape_area', axis = 1, inplace = True)

#release the temporary projected geometry
del projected_geometry


#initialize figure with one row and two columns
fig, ax = plt.subplots(1, 2, figsize = (10,3), tight_layout = True)
fig.suptitle('Top Ten Neighborhoods With Most Trees', color = 'forestgreen')
ax1, ax2 = ax

#one horizontal barplot per ranking: (axes, ranked column, panel title, x label)
panels = [
    (ax1, 'trees_count', 'Top 10 Most trees', 'Number of Trees'),
    (ax2, 'tree_density', 'Top 10 Most trees per square kilometer', 'Trees per km\u00B2'),
]
for panel, column, title, xlabel in panels:
    top_ten = Manhattan_neighborhoods_and_trees.nlargest(10, column)
    sns.barplot(data = top_ten, y = 'ntaname', x = column,
                color = 'forestgreen', ax = panel)
    panel.set_title(title, color = wood_color)
    panel.set_ylabel('')
    panel.set_xlabel(xlabel)

plt.show()


#check which neighborhoods appear in both top-ten charts above

#names of the ten densest neighborhoods and of the ten with the most trees, as sets
densest_ten = Manhattan_neighborhoods_and_trees.nlargest(10, 'tree_density')
most_trees_ten = Manhattan_neighborhoods_and_trees.nlargest(10, 'trees_count')
top_ten_most_dense_nbhd = set(densest_ten['ntaname'])
top_ten_most_trees_nbhd = set(most_trees_ten['ntaname'])

#neighborhoods present in both sets
intersection = list(top_ten_most_dense_nbhd.intersection(top_ten_most_trees_nbhd))

#print intersection
shared_names = ', '.join(intersection)
print("The Neighborhoods appears on both lists are: " , shared_names)

The Neighborhoods appears on both lists are:  Central Harlem North-Polo Grounds, Central Harlem South, West Village, Upper West Side, Upper East Side-Carnegie Hill, Morningside Heights


#import pearson r score
from scipy.stats import pearsonr


def _classify_nbhd(nbhd):
    """Return the ranking label of one neighborhood name.

    Membership in both top-ten lists is checked first, then most trees,
    then most dense; anything else is 'Not Ranked'.
    """
    if nbhd in intersection:
        return 'Top 10 Both'
    if nbhd in top_ten_most_trees_nbhd:
        return 'Top 10 Most Trees'
    if nbhd in top_ten_most_dense_nbhd:
        return 'Top 10 Most Dense'
    return 'Not Ranked'


#create new column to show if neighborhood is ranked with most number of trees or most dense with trees
nbhd_classification = [_classify_nbhd(nbhd) for nbhd in Manhattan_neighborhoods_and_trees['ntaname']]
Manhattan_neighborhoods_and_trees['nbhd_tree_count_classification'] = nbhd_classification

#find Pearson r correlation score between neighborhood area and tree count
r_score = pearsonr(Manhattan_neighborhoods_and_trees.area, Manhattan_neighborhoods_and_trees.trees_count)

#legend placement: anchored just outside the top-right corner of the axes
leg_kwds={
'loc': 'upper left',
'bbox_to_anchor':(1, 1.03)}

#initialize figure
fig, ax = plt.subplots(1,1, figsize = (10,5))
fig.suptitle('Trees in Manhattan per square kilometer', color = 'forestgreen')

#create scatterplot
sns.scatterplot(data = Manhattan_neighborhoods_and_trees, x = 'area', y = 'trees_count',
                hue = 'nbhd_tree_count_classification', palette = 'Set1',
                legend = True, ax = ax)

#create regplot, hide scatter points to show plot above, set ci = None to show only best fit line
sns.regplot(data = Manhattan_neighborhoods_and_trees, x = 'area', y = 'trees_count',
            scatter_kws = {'alpha' : 0}, line_kws = {'color' : 'yellow', 'alpha' : 0.5},
            ci = None, ax = ax)

#set title and labels
ax.set_title('Hued by Ranking', color = wood_color)
ax.set_xlabel('Area (km\u00B2)', color = wood_color)
ax.set_ylabel('Number of Trees', color = wood_color)
#the annotation is positioned in axes fractions (lower-right corner) so it stays
#inside the plot whatever the range of the area and count values is
ax.text(0.98, 0.04, f'Pearson r = {r_score[0].round(2)}', color = 'white',
        transform = ax.transAxes, ha = 'right', va = 'bottom')

#position legend outside plot (a single call; a second legend call would replace the first)
ax.legend(**leg_kwds)
plt.show()


#each to_bool column holds the share of a neighborhood's trees with that problem;
#adding the shares gives the average number of problems per tree in the neighborhood
harm_rating = Manhattan_neighborhoods_and_trees[to_bool]
Manhattan_neighborhoods_and_trees['overall_problem'] = harm_rating.sum(axis = 1)

#initialize light green palette
green_pal = sns.light_palette('forestgreen', as_cmap = True)

#initialize figure with 1 row and 2 columns sharing the y axis
fig, ax = plt.subplots(1,2, figsize = (10,5), sharey = True)
fig.suptitle('Trees in Manhattan per Square Kilometer', color = 'forestgreen')

#create 1st plot, colored by top-ten ranking
ax1 = sns.scatterplot(data = Manhattan_neighborhoods_and_trees, x = 'area', y = 'trees_count',
                hue = 'nbhd_tree_count_classification', palette = 'Set1',
                legend = True, ax = ax[0])
ax1.set_title('Hued by Ranking', color = wood_color)
ax1.set_xlabel('Area (km\u00B2)')
#legend goes to the left of the first panel (one call only; a later legend call replaces an earlier one)
ax1.legend(bbox_to_anchor = (-0.1, 1.03), loc = 'upper right')

#create second plot, colored by average health and sized by overall problems
ax2 = sns.scatterplot(data = Manhattan_neighborhoods_and_trees, x = 'area', y = 'trees_count',
                hue = 'average_health', palette = green_pal,
                size = 'overall_problem', ax = ax[1])
ax2.set_title('Hued by Health, Sized by Overall Problems')
ax2.set_xlabel('Area (km\u00B2)')
ax2.legend(bbox_to_anchor = (1,1.03))

#show plots
plt.show()


#import folium
import folium

#find center of Manhattan, we need this to center our map
#(median tree position, nudged by 0.01 degrees in each direction)
centerpoint = [np.median(trees.latitude) + .01, np.median(trees.longitude)- .01]

#initialize folium map with zooming and dragging switched off
Manhattan = folium.Map(location = centerpoint,
                       zoom_start = 11.5,
                       width = 700, height = 800,
                       zoom_control=False,
                       scrollWheelZoom=False,
                       dragging=False,
                      legend = 'Manhattan',
                      tiles = 'cartodbpositron')

#create base map for initial view of map (add_to already attaches it, once is enough)
base_map = folium.FeatureGroup(name = 'base', overlay = False, control = False).add_to(Manhattan)
folium.TileLayer(tiles = 'cartodbpositron').add_to(base_map)

#create popups that show neighborhood name, and number of trees
for _, row in Manhattan_neighborhoods_and_trees.iterrows():
    folium.Marker(location = [row.center_lat, row.center_lng],
                  popup = f'{row.ntaname}, trees: {row.trees_count}').add_to(base_map)


def _add_choropleth(name, column, legend_name, fill_color = 'BuGn', **extra):
    """Add one choropleth layer coloring the neighborhoods by `column`.

    Args:
        name: Layer name shown in the layer control.
        column: Column of Manhattan_neighborhoods_and_trees to color by.
        legend_name: Caption of the color scale.
        fill_color: Name of the color palette.
        **extra: Further keyword arguments forwarded to folium.Choropleth.

    Returns:
        The folium.Choropleth layer, already added to the `Manhattan` map.
    """
    return folium.Choropleth(geo_data = Manhattan_neighborhoods_and_trees,
                             name = name,
                             data = Manhattan_neighborhoods_and_trees,
                             columns = ['ntaname', column],
                             key_on = 'feature.properties.ntaname',
                             #opacity is a 0-1 fraction; 1 is fully opaque, which is
                             #what an out-of-range value such as 20 is clamped to
                             fill_color = fill_color, fill_opacity = 1,
                             legend_name = legend_name,
                             overlay = False, **extra).add_to(Manhattan)


#every layer is built with folium.Choropleth; the Map.choropleth method is
#deprecated in favour of that class and is absent from newer folium releases

#create choropleth layer based on Trees per Square kilometer
layer1 = _add_choropleth('Trees per Square Kilometer', 'tree_density',
                         'Trees per square kilometer by neighborhood')

#create choropleth layer based on number of trees
layer2 = _add_choropleth('Tree Count', 'trees_count', 'Trees by neighborhood')

#create choropleth layer based on average tree diameter
layer3 = _add_choropleth('Average Tree Diameter', 'tree_dbh', 'Average Tree Diameter')

#create choropleth layer based on tree health
layer4 = _add_choropleth('Average Tree Health', 'average_health', 'Average Tree Health')

#create choropleth layer based on average tree problems
harm_layer = _add_choropleth('Average Problem per Tree', 'overall_problem',
                             'Average Problem per Tree', fill_color = 'PuRd', legend = False)

#create choropleth layer based on percentage of trees planted on curb
curb_layer = _add_choropleth('Average on curb rate', 'on_curb',
                             'Average on curb rate', fill_color = 'Greys', legend = False)

#add in layer control for view control
folium.LayerControl(collapsed = False, position = 'topright').add_to(Manhattan)

#display folium map
Manhattan


#count trees per (neighborhood, species) within the neighborhoods of the most dense ranking,
#largest counts first
in_dense_nbhd = trees['nta_name'].isin(top_ten_most_dense_nbhd)
trees_grouped_by_nbhd = (
    trees[in_dense_nbhd]
    .groupby(['nta_name', 'spc_common'])
    .size()
    .to_frame('count')
    .sort_values('count', ascending = False)
)

#keep the 5 most prevalent species of each neighborhood, ordered by neighborhood then by count
top_five_per_nbhd = trees_grouped_by_nbhd.groupby(level = 'nta_name').head(5)
trees_grouped_by_nbhd_most_common_species = top_five_per_nbhd.sort_values(['nta_name', 'count'],
                                                                          ascending = [True, False])

#distinct species names appearing in any of those top-5 lists
most_common_trees_in_most_densed_nbhd = (
    trees_grouped_by_nbhd_most_common_species.reset_index()['spc_common'].unique().tolist()
)

#show dataframe
trees_grouped_by_nbhd_most_common_species


#create overall_harm for each tree: the number of problem flags that are True
trees['overall_harm'] = trees[to_bool].sum(axis = 1)

#create dataframe of trees grouped by species
#(each key appears once; a repeated dict key is silently overwritten by the later entry)
trees_by_species = trees.groupby('spc_common').agg({'tree_dbh' : 'mean',
                                                    'spc_common' : 'count',
                                                    'health' : 'mean',
                                                    'overall_harm' : 'mean',
                                                    'on_curb' : 'mean'})

#rename the spc_common index to species
trees_by_species.index.rename('species', inplace = True)
#rename the spc_common column, which contains the number of trees of the species, to count
trees_by_species.rename(columns = {'spc_common' : 'count'}, inplace = True)


#create red_pal
red_pal = sns.light_palette('red', as_cmap = True)

#We will pick trees that have count more than 15 to ensure that the trees recommended are not rare
#the threshold is named once so the filter and the plot title cannot disagree
min_species_count = 15
common_species = trees_by_species[trees_by_species['count'] > min_species_count]

sns.scatterplot(data = common_species, x = 'tree_dbh' , y = 'health',
                hue = 'overall_harm', palette = red_pal)
plt.title(f'Species with count more than {min_species_count}', color = 'forestgreen')
plt.show()


#import termcolor, so we can color the trees we recommend
from termcolor import colored

#conditions a species must meet, combined with `and`
#NOTE(review): `overall_harm > 0.4` keeps species with MORE recorded problems per tree;
#confirm this is intended rather than `<`
selection_criteria = ' and '.join([
    '(count > 15)',
    '(health > 2.5)',
    '(tree_dbh > 7)',
    '(overall_harm > 0.4)',
    '(on_curb > 0.5)',
])

#of the species passing the filter, keep the 10 with the largest mean trunk diameter
trees_to_plant = trees_by_species.query(selection_criteria).nlargest(10, 'tree_dbh')

#show recommendations
recommended_names = ', '.join(trees_to_plant.index)
print(colored("The trees we recommend that the city should plant are:",'green'))
print(colored(recommended_names, 'green'))

The trees we recommend that the city should plant are:
American Elm, London Planetree, Siberian Elm, Ohio Buckeye, Tree Of Heaven, Mulberry, Willow Oak, Pin Oak, Black Locust, Black Walnut


#show recommendations with their features, rounded and tagged as recommended
trees_to_plant = trees_to_plant.round(2).assign(label = 'recommended')
trees_to_plant


#species that are among the most common in the neighborhoods most dense with trees
is_prevalent = trees_by_species.index.isin(most_common_trees_in_most_densed_nbhd)
trees_0 = trees_by_species.loc[is_prevalent].round(2)
trees_0['label'] = 'on most dense'

#show most prevalent trees with their features
trees_0


#species present both in the recommendation and among the most prevalent on most dense nbhds
shared_species = set(trees_to_plant.index).intersection(set(trees_0.index))
print('The trees that we recommended that are most prevalent on the neighborhoods with most trees are ', ', '.join(list(shared_species)))

The trees that we recommended that are most prevalent on the neighborhoods with most trees are  London Planetree, American Elm, Pin Oak


#stack both labelled tables, this is to make swarmplots
joined = pd.concat([trees_to_plant, trees_0])

#initiate figure with one panel per compared feature
fig, ax = plt.subplots(1, 5, figsize = (15,5))

#set title
fig.suptitle('Comparison of Recommended to Prevalent Trees', color = wood_color)

#one swarmplot per feature, split by label
metrics = ['tree_dbh', 'count', 'health', 'overall_harm', 'on_curb']
for panel, metric in zip(ax, metrics):
    sns.swarmplot(data = joined, x = 'label', y = metric, ax = panel,
                  palette = ['limegreen', wood_color])
    panel.set_title(metric)
    panel.set_xlabel('')
    panel.set_ylabel('')

#show plot
plt.show()


#repeat the recommendation as the closing statement
recommendation_text = ', '.join(trees_to_plant.index)
print(colored("The trees we recommend that the city should plant are:",'green'))
print(colored(recommendation_text, color = 'green'))

The trees we recommend that the city should plant are:
American Elm, London Planetree, Siberian Elm, Ohio Buckeye, Tree Of Heaven, Mulberry, Willow Oak, Pin Oak, Black Locust, Black Walnut

	borocode	boroname	countyfips	ntacode	ntaname	shape_area	shape_leng	geometry
0	3.0	Brooklyn	047	BK43	Midwood	3.579964e+07	27996.591274	POLYGON ((-73.94733 40.62917, -73.94687 40.626...
1	3.0	Brooklyn	047	BK75	Bedford	3.262983e+07	29992.919174	POLYGON ((-73.94193 40.70073, -73.94439 40.700...
2	2.0	Bronx	005	BX40	Fordham South	6.307284e+06	15878.272921	POLYGON ((-73.89138 40.86170, -73.89106 40.861...
3	3.0	Brooklyn	047	BK88	Borough Park	5.400502e+07	39247.227722	POLYGON ((-73.97605 40.63128, -73.97717 40.630...
4	3.0	Brooklyn	047	BK96	Rugby-Remsen Village	3.270695e+07	30957.853395	POLYGON ((-73.90856 40.65210, -73.90945 40.651...

		count
nta_name	spc_common
Central Harlem North-Polo Grounds	Honeylocust	640
	Callery Pear	451
	Pin Oak	273
	Sophora	269
	London Planetree	265
Central Harlem South	Honeylocust	441
	Ginkgo	280
	London Planetree	252
	Callery Pear	234
	Japanese Zelkova	196
East Village	Honeylocust	322
	Callery Pear	197
	Sophora	150
	Ginkgo	143
	Littleleaf Linden	101
Gramercy	Honeylocust	231
	Ginkgo	185
	Callery Pear	166
	Sophora	94
	London Planetree	91
Hamilton Heights	Pin Oak	426
	Honeylocust	385
	Sophora	166
	London Planetree	140
	Callery Pear	127
Morningside Heights	Honeylocust	391
	Ginkgo	371
	American Elm	296
	Callery Pear	232
	Pin Oak	225
Upper East Side-Carnegie Hill	Callery Pear	828
	Honeylocust	725
	Ginkgo	569
	American Elm	390
	Littleleaf Linden	286
Upper West Side	Honeylocust	972
	Pin Oak	809
	Ginkgo	533
	Sophora	492
	Callery Pear	462
West Village	Callery Pear	568
	Honeylocust	560
	Ginkgo	535
	Sophora	280
	London Planetree	246
Yorkville	Honeylocust	680
	Callery Pear	286
	Ginkgo	188
	Littleleaf Linden	147
	Sophora	129

	tree_dbh	count	health	overall_harm	on_curb	label
species
American Elm	13.90	1698	2.76	0.43	0.72	recommended
London Planetree	13.17	4122	2.54	0.48	0.83	recommended
Siberian Elm	12.06	156	2.79	0.42	0.92	recommended
Ohio Buckeye	11.96	24	2.71	0.88	0.96	recommended
Tree Of Heaven	11.45	104	2.74	0.78	0.96	recommended
Mulberry	11.00	68	2.63	0.46	0.88	recommended
Willow Oak	10.81	889	2.81	0.59	0.97	recommended
Pin Oak	10.07	4584	2.78	0.47	0.91	recommended
Black Locust	9.77	259	2.75	0.54	0.98	recommended
Black Walnut	9.64	33	2.76	0.58	1.00	recommended

	tree_dbh	count	health	overall_harm	on_curb	label
species
American Elm	13.90	1698	2.76	0.43	0.72	on most dense
Callery Pear	8.68	7297	2.68	0.64	0.97	on most dense
Ginkgo	8.45	5859	2.66	0.56	0.96	on most dense
Honeylocust	9.06	13176	2.82	0.70	0.95	on most dense
Japanese Zelkova	7.04	3596	2.71	0.51	0.95	on most dense
Littleleaf Linden	7.87	3333	2.48	0.55	0.94	on most dense
London Planetree	13.17	4122	2.54	0.48	0.83	on most dense
Pin Oak	10.07	4584	2.78	0.47	0.91	on most dense
Sophora	9.23	4453	2.76	0.57	0.96	on most dense

Table of Contents¶

Which tree species should the city plant?¶

📖 Background¶

💾 The data¶

Tree Census¶

Neighborhoods' geographical information¶

Data Cleansing¶

Explore column types and entries¶

Convert data types and transform entries¶

Cleaning null values¶

Check for duplicates¶

Analysis¶

What are the most common tree species in Manhattan?¶

Which neighborhoods have the most trees?¶

Create aggregated table with details about each Manhattan neighborhood¶

Plot neighborhoods tree counts¶

A visualization of Manhattan's neighborhoods and trees¶

Ten species recommendation¶

Which species are most common in the neighborhoods most dense with trees?¶

Recommended Trees¶

Comparing recommended trees to the trees most prevalent in the densest neighborhoods¶

Recommendations:¶

	tree_id	tree_dbh	curb_loc	spc_common	status	health	root_stone	root_grate	root_other	trnk_wire	trnk_light	trnk_other	brch_light	brch_shoe	brch_other	postcode	nta	nta_name	latitude	longitude
0	190422	11	OnCurb	honeylocust	Alive	Good	No	No	No	No	No	No	No	No	No	10023	MN14	Lincoln Square	40.770046	-73.984950
1	190426	11	OnCurb	honeylocust	Alive	Good	No	No	No	No	No	No	No	No	No	10023	MN14	Lincoln Square	40.770210	-73.985338
2	208649	9	OnCurb	American linden	Alive	Good	No	Yes	No	No	No	No	No	No	No	10019	MN15	Clinton	40.762724	-73.987297
3	193310	14	OnCurb	honeylocust	Alive	Good	No	No	Yes	No	No	Yes	No	No	Yes	10021	MN40	Upper East Side-Carnegie Hill	40.772171	-73.960456
4	199760	4	OnCurb	willow oak	Alive	Fair	No	No	No	No	No	No	No	No	No	10024	MN12	Upper West Side	40.782087	-73.980964

	tree_id	tree_dbh	curb_loc	spc_common	status	health	root_stone	root_grate	root_other	trnk_wire	...	brch_shoe	brch_other	postcode	nta	nta_name	latitude	longitude	alive	healthy	on_curb
2163	196865	3	OnCurb	cherry	Alive	Good	False	False	False	False	...	False	False	10002	MN27	Chinatown	40.717069	-73.991512	True	True	True
63624	189960	4	OnCurb	Sophora	Alive	Good	False	False	False	False	...	False	False	10065	MN31	Lenox Hill-Roosevelt Island	40.763563	-73.963746	True	True	True
40283	70926	11	OnCurb	honeylocust	Alive	Good	True	False	False	False	...	False	False	10013	MN24	SoHo-TriBeCa-Civic Center-Little Italy	40.725996	-74.008768	True	True	True
31314	533678	10	OnCurb	honeylocust	Alive	Good	False	False	False	False	...	False	False	10019	MN15	Clinton	40.764134	-73.990354	True	True	True
63924	203888	7	OnCurb	Japanese zelkova	Alive	Fair	True	False	False	False	...	False	False	10025	MN09	Morningside Heights	40.805910	-73.961159	True	False	True
43542	49341	19	OnCurb	honeylocust	Alive	Good	True	False	False	False	...	False	False	10011	MN23	West Village	40.734688	-73.999618	True	True	True
10845	162819	3	OnCurb	serviceberry	Alive	Good	False	False	False	False	...	False	False	10009	MN28	Lower East Side	40.725483	-73.978590	True	True	True
38716	65913	17	OnCurb	Chinese elm	Alive	Good	False	False	False	False	...	False	False	10031	MN04	Hamilton Heights	40.821959	-73.948276	True	True	True
14186	255690	7	OnCurb	Japanese zelkova	Alive	Good	False	False	False	False	...	False	False	10040	MN35	Washington Heights North	40.855276	-73.930711	True	True	True
3088	171661	16	OnCurb	Sophora	Alive	Good	False	False	False	False	...	False	False	10009	MN28	Lower East Side	40.727591	-73.976396	True	True	True
21687	296601	8	OnCurb	crimson king maple	Alive	Fair	False	False	False	False	...	False	True	10021	MN40	Upper East Side-Carnegie Hill	40.768693	-73.964001	True	False	True
50049	107416	6	OnCurb	Japanese zelkova	Alive	Good	False	False	False	False	...	False	False	10034	MN01	Marble Hill-Inwood	40.865816	-73.922447	True	True	True
12330	278478	5	OnCurb	ginkgo	Alive	Good	True	False	False	False	...	False	False	10128	MN32	Yorkville	40.778958	-73.948196	True	True	True
23989	363310	4	OnCurb	Chinese elm	Alive	Good	False	False	False	False	...	False	False	10032	MN36	Washington Heights South	40.844657	-73.940615	True	True	True
37033	43604	16	OnCurb	honeylocust	Alive	Good	True	False	True	False	...	False	False	10027	MN34	East Harlem North	40.804893	-73.941464	True	True	True
33906	27732	9	OnCurb	ginkgo	Alive	Good	False	True	False	False	...	False	False	10011	MN13	Hudson Yards-Chelsea-Flatiron-Union Square	40.741478	-73.997205	True	True	True
20760	310333	10	OnCurb	ginkgo	Alive	Fair	False	False	False	False	...	False	False	10036	MN15	Clinton	40.760222	-73.989072	True	False	True
9361	241442	11	OnCurb	London planetree	Alive	Fair	False	False	False	False	...	False	False	10016	MN20	Murray Hill-Kips Bay	40.743966	-73.977797	True	False	True
31611	537302	18	OnCurb	willow oak	Alive	Good	True	False	True	False	...	False	False	10024	MN12	Upper West Side	40.780991	-73.979152	True	True	True
60550	181097	5	OnCurb	Japanese zelkova	Alive	Fair	False	False	False	False	...	False	True	10040	MN35	Washington Heights North	40.864522	-73.927832	True	False	True