Analyzing Tree Canopy and Gentrification Changes
- Yesh Khanna
- Aug 8, 2024
- 5 min read
Updated: Aug 9, 2024
Over the summer I have been a part of Temple's Spatial Analytics Lab, assisting Dr. Daniel Wiese's American Cancer Society research on the correlation between changes in tree canopy cover and gentrification trends in metropolitan environments. This research is part of a larger study examining the effects of greenspaces on post-treatment recovery in cancer patients. My primary task is to automate the extraction of tree canopy cover change over the years and combine it with gentrification trends to produce spatial and statistical models. This post briefly walks through the key steps in the process and then discusses potential improvements.

Tree canopy cover change was measured over a ten-year period using NLCD's Tree Canopy Cover datasets for 2011, 2018, and 2021. Gentrification data was provided by Dr. Wiese. The first step is to extract and calculate tree canopy cover changes over those years for a state chosen by user input (Pennsylvania is used as the example in this post), and to export that data as an Excel file.
import geopandas as gpd
import rasterio
from rasterio.mask import mask
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from shapely.geometry import box

def extract_tracts_for_state(shapefile_path, state_name):
    # Read the shapefile
    gdf = gpd.read_file(shapefile_path)
    # Filter the GeoDataFrame to include only the specified state
    state_gdf = gdf[gdf['STATE_NAME'] == state_name]
    return state_gdf

def calculate_average_raster_values(raster_paths, tracts_gdf):
    for year, raster_path in raster_paths.items():
        average_values = []
        with rasterio.open(raster_path) as src:
            # Assumes the tract geometries are in the raster's CRS;
            # if not, reproject first with tracts_gdf.to_crs(src.crs)
            raster_bounds = box(*src.bounds)
            for tract in tracts_gdf.geometry:
                # Check if the tract overlaps with the raster bounds
                if not tract.intersects(raster_bounds):
                    average_values.append(np.nan)
                    continue
                # Mask the raster with the current tract
                try:
                    out_image, out_transform = mask(src, [tract], crop=True)
                    # Calculate the average value of the masked area
                    if out_image.size > 0:
                        average_value = np.mean(out_image[out_image > 0])  # ignoring no-data values
                    else:
                        average_value = np.nan
                    average_values.append(average_value)
                except ValueError:
                    # Handle the case where the tract does not overlap the raster
                    average_values.append(np.nan)
        # Add the average values for the current year to the GeoDataFrame
        tracts_gdf[f'average_raster_value_{year}'] = average_values
    return tracts_gdf

def calculate_raster_value_changes(tracts_gdf):
    tracts_gdf['change_2011_2018'] = tracts_gdf['average_raster_value_2018'] - tracts_gdf['average_raster_value_2011']
    tracts_gdf['change_2018_2021'] = tracts_gdf['average_raster_value_2021'] - tracts_gdf['average_raster_value_2018']
    tracts_gdf['change_2011_2021'] = tracts_gdf['average_raster_value_2021'] - tracts_gdf['average_raster_value_2011']
    return tracts_gdf

def plot_choropleth_maps(tracts_gdf, columns, state_name):
    fig, axes = plt.subplots(1, len(columns), figsize=(20, 10))
    for ax, column in zip(axes, columns):
        tracts_gdf.plot(column=column, ax=ax, legend=True,
                        legend_kwds={'label': column,
                                     'orientation': "horizontal"},
                        cmap='viridis')
        ax.set_title(f'{state_name} - {column}')
        ax.set_xlabel('Longitude')
        ax.set_ylabel('Latitude')
    plt.tight_layout()
    plt.show()

if __name__ == "__main__":
    # Replace with the path to your shapefile
    shapefile_path = 'data/tracts.shp'
    # Dictionary of raster files with their respective years
    raster_paths = {
        2011: 'data/nlcd_tcc_conus_2011_v2021-4.tif',
        2018: 'data/nlcd_tcc_conus_2018_v2021-4.tif',
        2021: 'data/nlcd_tcc_conus_2021_v2021-4.tif'
    }
    # Take the state name as user input
    state_name = input("Enter the state name: ")
    # Extract tracts for the specified state
    tracts_gdf = extract_tracts_for_state(shapefile_path, state_name)
    # Calculate the average raster values for each year
    tracts_with_avg_values = calculate_average_raster_values(raster_paths, tracts_gdf)
    # Calculate the raster value changes
    tracts_with_changes = calculate_raster_value_changes(tracts_with_avg_values)
    # Display the first few rows of the GeoDataFrame
    print(tracts_with_changes.head())
    # Uncomment the line below to display the entire GeoDataFrame
    # print(tracts_with_changes.to_string())
    # Plot choropleth maps for all changes
    plot_choropleth_maps(tracts_with_changes, ['change_2011_2018', 'change_2018_2021', 'change_2011_2021'], state_name)
    # Export the updated GeoDataFrame to an Excel file
    tracts_with_changes.to_excel(f'{state_name}_tracts_with_avg_values_and_changes.xlsx', index=False)
    print(f"GeoDataFrame exported as {state_name}_tracts_with_avg_values_and_changes.xlsx")
The above script masks the rasters to each tract in the state shapefile and calculates the average canopy value per tract for each year. It then calculates the changes between years and outputs them as an Excel file and a series of choropleth maps.
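One caveat worth flagging: rasterio's mask() expects the geometries to be in the raster's coordinate system, and the NLCD rasters are typically distributed in a CONUS Albers projection. If the tract shapefile uses a different CRS, the bounds check above can silently mark every tract as non-overlapping. A minimal sketch of reprojecting first, reusing the file paths from the script above:

import geopandas as gpd
import rasterio

shapefile_path = 'data/tracts.shp'
raster_path = 'data/nlcd_tcc_conus_2011_v2021-4.tif'

tracts_gdf = gpd.read_file(shapefile_path)
with rasterio.open(raster_path) as src:
    # Reproject the tract geometries into the raster's CRS so the
    # bounds check and mask() operate in the same coordinate system
    tracts_gdf = tracts_gdf.to_crs(src.crs)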
This dataset is then joined with the gentrification data using an inner join on the 'GEOID' field, and tracts in the Philadelphia metro area (Metro code 37980) are selected to create and export a new dataset combining tree canopy cover change and gentrification data.
#%%
import pandas as pd
# Load the Excel file
pa_tracts_df = pd.read_excel('Pennsylvania_tracts_with_avg_values_and_changes.xlsx')
# Load the CSV file
metro_usa_df = pd.read_csv('gen_metro_usa.csv')
#%%
# Display the first few rows of each dataframe
print(pa_tracts_df.head())
print(metro_usa_df.head())
# Check the columns and data types
print(pa_tracts_df.info())
print(metro_usa_df.info())
#%%
merged_df = pd.merge(pa_tracts_df, metro_usa_df, on='GEOID', how='inner')
# Display the first few rows of the merged dataframe
print(merged_df.head())
# Summary statistics
print(merged_df.describe())
# Check for missing values
print(merged_df.isnull().sum())
#%%
# Filter the merged dataframe for rows where the 'Metro' field is 37980
filtered_df = merged_df[merged_df['Metro'] == 37980]
# Display the first few rows of the filtered dataframe
print(filtered_df.head())
# Optionally, check the size of the filtered dataframe to ensure the filtering worked as expected
print(filtered_df.shape)
#%%
PhillyTCCGent = filtered_df
PhillyTCCGent.to_csv('PhillyTCCGent.csv', index=False)
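One pitfall with this join: depending on how the Excel and CSV files were written, pandas may parse GEOID as a number in one file and a string in the other, in which case the inner join silently returns zero rows. A hedged sketch of normalizing both sides before merging (the 11-digit zero-padding assumes standard census tract GEOIDs):

import pandas as pd

pa_tracts_df = pd.read_excel('Pennsylvania_tracts_with_avg_values_and_changes.xlsx')
metro_usa_df = pd.read_csv('gen_metro_usa.csv')

# Normalize GEOID to an 11-character, zero-padded string on both sides so the
# merge keys match regardless of how each file was parsed
for df in (pa_tracts_df, metro_usa_df):
    df['GEOID'] = df['GEOID'].astype(str).str.split('.').str[0].str.zfill(11)

merged_df = pd.merge(pa_tracts_df, metro_usa_df, on='GEOID', how='inner')
print(merged_df.shape)  # sanity check: should not be (0, n)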

The resulting dataset is used to run spatial and statistical analyses. It is loaded into a GeoDataFrame for spatial analysis. The script calculates summary statistics (mean, count, standard deviation, and 95% confidence intervals) for average canopy cover in each year (2011, 2018, 2021), grouped by gentrification status. It compiles the statistics for all years into a single DataFrame and exports it to an Excel file. A map is then plotted to visualize greenspace changes across Philadelphia, with different colors marking the various gentrification statuses. Finally, the script runs a regression analysis modeling the relationship between greenspace change and gentrification status, treating gentrification status as a categorical variable.
import pandas as pd
import geopandas as gpd
from shapely import wkt
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

def load_data(filepath):
    """Load the data from CSV and convert to GeoDataFrame."""
    data = pd.read_csv(filepath)
    # Geometry was serialized as WKT text in the CSV; parse it back into shapes
    data['geometry'] = data['geometry'].apply(wkt.loads)
    data_gdf = gpd.GeoDataFrame(data, geometry='geometry')
    return data_gdf

def calculate_statistics(df, year):
    """Calculate statistics for a specific year and return a summary DataFrame."""
    groups = df.groupby('gen_stat_fin')[f'average_raster_value_{year}']
    stats_summary = groups.agg(['mean', 'count', 'std'])
    # 95% confidence interval for each group mean (needs at least two observations)
    ci95 = stats_summary.apply(
        lambda row: stats.t.interval(0.95, row['count'] - 1, loc=row['mean'],
                                     scale=row['std'] / np.sqrt(row['count']))
        if row['count'] > 1 else (np.nan, np.nan),
        axis=1)
    stats_summary['ci_lower'], stats_summary['ci_upper'] = zip(*ci95)
    stats_summary = stats_summary.reset_index()
    stats_summary['year'] = year
    return stats_summary

def compile_statistics(df):
    """Compile statistics for multiple years and combine them into a single DataFrame."""
    stats_2011 = calculate_statistics(df, 2011)
    stats_2018 = calculate_statistics(df, 2018)
    stats_2021 = calculate_statistics(df, 2021)
    all_stats = pd.concat([stats_2011, stats_2018, stats_2021])
    all_stats['Metro_area'] = 'Philadelphia'
    all_stats = all_stats[['Metro_area', 'gen_stat_fin', 'year', 'mean', 'std', 'ci_lower', 'ci_upper', 'count']]
    return all_stats

def export_statistics(stats_df, output_path):
    """Export the statistics DataFrame to an Excel file."""
    stats_df.to_excel(output_path, index=False)
    # print(f'Statistics exported to {output_path}')

def plot_map(df, color_palette):
    """Plot the map showing greenspace changes and gentrification status."""
    df['color'] = df['gen_stat_fin'].map(color_palette)
    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    # Plot the base map with greenspace changes
    df.plot(column='change_2011_2021', ax=ax, legend=False, cmap='Greens', alpha=0.9, edgecolor='black', linewidth=0.1)
    # Overlay gentrification status with distinct colors and thicker boundaries
    for category, color in color_palette.items():
        subset = df[df['gen_stat_fin'] == category]
        subset.plot(ax=ax, facecolor="none", edgecolor=color, linewidth=0.5)
    # Customize the legend
    handles = [Patch(facecolor='none', edgecolor='black', label='Boundary')]
    for category, color in color_palette.items():
        handles.append(Patch(facecolor='none', edgecolor=color, label=f'Gentrification Status {category}', linewidth=1))
    # Add legends for both greenspace changes and gentrification status
    plt.legend(handles=handles, title='Gentrification Status', loc='lower right')
    # Add a colorbar for greenspace changes ('mappable' avoids shadowing the statsmodels alias sm)
    mappable = plt.cm.ScalarMappable(cmap='Greens',
                                     norm=plt.Normalize(vmin=df['change_2011_2021'].min(),
                                                        vmax=df['change_2011_2021'].max()))
    mappable.set_array([])
    cbar = plt.colorbar(mappable, ax=ax, fraction=0.03, pad=0.04)
    cbar.set_label('Change in Greenspaces 2011-2021')
    plt.title('Change in Greenspaces 2011-2021 and Gentrification Status in Philadelphia')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.grid(True)  # Add grid lines
    plt.show()

def run_regression(df):
    """Run a regression model on greenspace changes and gentrification status."""
    formula = 'change_2011_2021 ~ C(gen_stat_fin)'
    model = smf.ols(formula, data=df).fit()
    print(model.summary())

def main():
    # Define the file path and output path
    filepath = 'PhillyTCCGent.csv'
    output_path = 'data/Greenness_Statistics_Philadelphia.xlsx'
    # Define the color palette for gentrification status
    color_palette = {1: "red", 2: "blue", 3: "yellow", 22: "purple", 99: "orange"}
    # Load the data
    philly_data_gdf = load_data(filepath)
    # Calculate and export statistics
    all_stats = compile_statistics(philly_data_gdf)
    export_statistics(all_stats, output_path)
    # Plot the map
    plot_map(philly_data_gdf, color_palette)
    # Run regression analysis
    run_regression(philly_data_gdf)
    # Optionally, print the statistics
    print(all_stats)

# Run the main function
if __name__ == "__main__":
    main()
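A note on the C() term in the regression formula: statsmodels (via patsy) expands the categorical predictor into indicator columns, with the lowest level absorbed into the intercept as the reference category, so each coefficient is a contrast against that baseline. A small sketch with made-up toy values showing the design matrix patsy builds:

from patsy import dmatrix
import pandas as pd

# Toy data, hypothetical values only, to illustrate the dummy encoding
toy = pd.DataFrame({'gen_stat_fin': [1, 2, 3, 2, 1]})

# Level 1 becomes the reference; levels 2 and 3 each get an indicator column
design = dmatrix('C(gen_stat_fin)', toy, return_type='dataframe')
print(design)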
In the coming weeks I will be working on automating the interaction between the various scripts by turning them into modules, and creating a 'master script' to improve the organization and flow of the program.
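As a preview, here is a minimal sketch of what that master script might look like, assuming the three scripts above are refactored into modules with placeholder names canopy_extraction, merge_gentrification, and philly_analysis, each exposing a hypothetical run() entry point:

# master.py - hypothetical orchestration script; module and function
# names are placeholders for the refactored versions of the scripts above
import canopy_extraction
import merge_gentrification
import philly_analysis

def main():
    state_name = input("Enter the state name: ")
    # Step 1: extract tracts and compute per-year canopy averages and changes
    tracts_path = canopy_extraction.run(state_name)
    # Step 2: join with gentrification data and filter to the Philadelphia metro
    merged_path = merge_gentrification.run(tracts_path)
    # Step 3: summary statistics, map, and regression
    philly_analysis.run(merged_path)

if __name__ == "__main__":
    main()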