import requests
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sn
import plotly.express as px
from wordcloud import WordCloud
import nltk
from nltk.corpus import wordnet
from bs4 import BeautifulSoup


# Read the data; retrieved from https://github.com/jamesqo/gun-violence-data
gun = pd.read_csv('2013-2018.csv')
# Keep an independent copy of the raw data. A plain `gun_save = gun` only gives
# the same DataFrame a second name, so removing columns below would remove
# them from the "saved" dataset as well.
gun_save = gun.copy()


# The following columns are not needed in this project, so they are removed here.
unused_columns = [
    "incident_id",  # gunviolencearchive.org ID for incident
    "incident_url",  # link to gunviolencearchive.org webpage containing details of incident
    "source_url",  # link to online news story concerning incident
    "incident_url_fields_missing",  # ignore, always False
    "participant_status",  # key: participant ID, value: 'Arrested', 'Killed', 'Injured', or 'Unharmed'
    "sources",  # links to online news stories concerning incident
    "participant_name",
]
gun = gun.drop(columns=unused_columns)

def modify_cell(x):
    """Split a packed ``index::value||index::value`` cell into a list of values.

    Args:
        x: A cell value. Missing values and strings that do not contain
            ``"::"`` are returned unchanged.

    Returns:
        ``x`` itself when it is missing or holds no ``"::"``; otherwise a list
        with one entry per ``"||"``-separated element, each without its
        leading ``index::`` prefix.
    """
    if pd.isna(x):
        return x
    if "::" not in x:
        return x
    # Cut at the first "::" instead of dropping a fixed three characters: from
    # element 10 onwards the prefix is longer ("10::Male"), and a fixed slice
    # would leave a stray ":" in front of the value. An element that has no
    # "::" of its own is kept whole.
    return [element.split("::", 1)[-1] for element in x.split("||")]
# modify_cell cleans the cells: it removes the "||" separators and the
# "index::" prefix in front of every value.


# Turn every cell into a string and clean it with modify_cell in a single pass.
gun_str = gun.applymap(lambda cell: modify_cell(str(cell)))
pd.set_option('display.max_colwidth', None)


# The casualty counts were stringified above; convert them back to integers
# for the later data visualization.
for count_column in ('n_killed', 'n_injured'):
    gun_str[count_column] = gun_str[count_column].astype(int)

def column_sum(selected_column):
    """Return the sum of all values in a column.

    Args:
        selected_column: An iterable of numbers, e.g. a list or a pandas Series.

    Returns:
        The total of the values; ``0`` for an empty column.
    """
    # Iterate over the values themselves. Looking each one up as
    # ``selected_column[i]`` for i in 0..len-1 is a label lookup on a Series,
    # which fails or reads the wrong rows once the index is no longer 0..n-1
    # (for example after rows have been filtered out).
    return sum(selected_column)


# Count the victims and suspects of each incident from that incident's own
# 'participant_type' cell. (Reading the cell of row 0 on every iteration gives
# all incidents the counts of the first incident.)
participant_types = gun_str['participant_type']
Victim = [cell.count('Victim') for cell in participant_types]
suspect = [cell.count('Subject-Suspect') for cell in participant_types]

# Create new columns in the dataframe for the number of victims and suspects in each incident
gun_str['Number_of_Victim'] = Victim
gun_str['Number_of_suspect'] = suspect


gun_str.head(5)


# Work on a copy of gun_str so the extra year column stays out of the cleaned data.
df1_added_year = gun_str.copy()

# Derive the incident year from the 'date' column and store it as 'Year_of_incident'.
df1_added_year['Year_of_incident'] = pd.to_datetime(df1_added_year['date']).dt.year
# Incidents per year in chronological order, then the same counts without 2013 and 2018.
incident_counts = df1_added_year['Year_of_incident'].value_counts().sort_index()
incident_counts_del = incident_counts.drop(labels=[2013, 2018])


fig, ax = plt.subplots(figsize=(6, 4))

incident_counts_del.plot(
    kind='bar',
    ax=ax,
    xlabel='Year',
    ylabel='Number of Incidents',
    width=0.5,
    alpha=0.5,
)

# Write each bar's count slightly above the bar.
for bar_position, bar_height in enumerate(incident_counts_del.values):
    ax.text(bar_position, bar_height + 500, str(bar_height), ha='center')

# Set the title
ax.set_title('Gun Violence Incidents by Year (2014-2017)')

# Output: Text(0.5, 1.0, 'Gun Violence Incidents by Year (2014-2017)')


# Copy gun_str and tag every incident with its year-month period
# in the new column 'year_month_of_incidence'.
df1_month = gun_str.copy()
df1_month['year_month_of_incidence'] = pd.to_datetime(df1_month['date']).dt.to_period('M')

# Number of incidents per year-month over the whole data set.
incidents_by_month = df1_month.groupby('year_month_of_incidence')['date'].count()


# The same count with the rows from 2013 left out.
not_in_2013 = df1_month['year_month_of_incidence'].dt.year != 2013
df2_month = df1_month[not_in_2013]
incidents_by_month_2 = df2_month.groupby('year_month_of_incidence')['date'].count()

# Line chart of the incidents by month.
month_labels = incidents_by_month_2.index.astype(str)
fig_monthly_incidents_2 = px.line(
    incidents_by_month_2,
    x=month_labels,
    y='date',
    labels={'x': 'Month-Year', 'date': 'Number of Incidents'},
    title='Gun Violence Incidents by Month from 2014 to 2018',
)
fig_monthly_incidents_2.show(renderer='notebook')


# Count the incidents by state in a new df with the columns 'state_name' and
# 'incidents'. The column names are set explicitly (rename_axis / name=)
# because the default names produced by value_counts().reset_index() differ
# between pandas versions ('index'/'state' before 2.0, 'state'/'count' from 2.0).
states_count = (gun_str['state']
                .value_counts()
                .rename_axis('state_name')
                .reset_index(name='incidents')
                .astype({'incidents': int}))

# Lookup table from the full state / territory name to its two-letter
# abbreviation; used to build the state_abbreviation column.
us_state_to_abbrev = {
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR",
    "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE",
    "Florida": "FL", "Georgia": "GA", "Hawaii": "HI", "Idaho": "ID",
    "Illinois": "IL", "Indiana": "IN", "Iowa": "IA", "Kansas": "KS",
    "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD",
    "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS",
    "Missouri": "MO", "Montana": "MT", "Nebraska": "NE", "Nevada": "NV",
    "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY",
    "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH", "Oklahoma": "OK",
    "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI", "South Carolina": "SC",
    "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT",
    "Vermont": "VT", "Virginia": "VA", "Washington": "WA", "West Virginia": "WV",
    "Wisconsin": "WI", "Wyoming": "WY",
    # District and territories
    "District of Columbia": "DC", "American Samoa": "AS", "Guam": "GU",
    "Northern Mariana Islands": "MP", "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM", "U.S. Virgin Islands": "VI",
}

# Add the two-letter abbreviation of each state name; the map below locates the states by it.
states_count['state_abbr'] = states_count['state_name'].map(us_state_to_abbrev)


# Map of the US divided by state, colored by the number of gun violence
# incidents in each state (the darker the color, the more incidents reported).
# Hovering over a state shows its number of incidents, its name, and its abbreviation.
choropleth_options = dict(
    locations='state_abbr',
    locationmode='USA-states',
    scope='usa',
    color='incidents',
    color_continuous_scale="Viridis_r",
    hover_data=['state_name', 'incidents'],
)
fig_incidents_states = px.choropleth(states_count, **choropleth_options)
fig_incidents_states.update_layout(title_text='Gun Violence Incidents by State')
fig_incidents_states.show(renderer='notebook')


def _rows_aligned_with_types(frame, column):
    """Return a boolean Series flagging rows whose `column` cell lines up with 'participant_type'.

    Two cells line up when they have the same length and are both lists or
    both plain strings. The kind check matters because missing cells are the
    text 'nan': by length alone, 'nan' (three characters) would count as
    matching a list of three participants.
    """
    flags = [
        isinstance(values, list) == isinstance(types, list) and len(values) == len(types)
        for values, types in zip(frame[column], frame['participant_type'])
    ]
    return pd.Series(flags, index=frame.index, dtype=bool)


# keep the rows whose 'participant_age' cell lines up with the 'participant_type' cell
df_age = gun_str[_rows_aligned_with_types(gun_str, 'participant_age')]

# of those, keep the rows whose 'participant_age_group' cell lines up as well
df_age_group2 = df_age[_rows_aligned_with_types(df_age, 'participant_age_group')]
# of those, keep the rows whose 'participant_gender' cell lines up as well
df_gender2 = df_age_group2[_rows_aligned_with_types(df_age_group2, 'participant_gender')]
# rename the dataframe; take a real copy because new columns are added to
# `gun` later and a filtered view should not be written to
gun = df_gender2.copy()


import warnings
warnings.filterwarnings('ignore')   # to hide the warning messages

# For every incident, split the age, age-group and gender values by the role
# ('Victim' or 'Subject-Suspect') found at the same position in 'participant_type'.
participant_roles = gun['participant_type']

victim_ages = [
    [age for age, role in zip(ages, roles) if role == 'Victim']
    for ages, roles in zip(gun['participant_age'], participant_roles)
]
suspect_ages = [
    [age for age, role in zip(ages, roles) if role == 'Subject-Suspect']
    for ages, roles in zip(gun['participant_age'], participant_roles)
]
victim_age_groups = [
    [age_group for age_group, role in zip(age_groups, roles) if role == 'Victim']
    for age_groups, roles in zip(gun['participant_age_group'], participant_roles)
]
suspect_age_groups = [
    [age_group for age_group, role in zip(age_groups, roles) if role == 'Subject-Suspect']
    for age_groups, roles in zip(gun['participant_age_group'], participant_roles)
]
victim_genders = [
    [gender for gender, role in zip(genders, roles) if role == 'Victim']
    for genders, roles in zip(gun['participant_gender'], participant_roles)
]
suspect_genders = [
    [gender for gender, role in zip(genders, roles) if role == 'Subject-Suspect']
    for genders, roles in zip(gun['participant_gender'], participant_roles)
]

# Store the per-role lists as new columns of the 'gun' DataFrame
gun['suspect_ages'] = suspect_ages
gun['victim_ages'] = victim_ages
gun['victim_age_group'] = victim_age_groups
gun['suspect_age_group'] = suspect_age_groups
gun['victim_genders'] = victim_genders
gun['suspect_genders'] = suspect_genders


new_participant_columns = ['victim_ages', 'suspect_ages', 'victim_age_group',
                           'suspect_age_group', 'victim_genders', 'suspect_genders']
gun[new_participant_columns].head(10)


gun_reindex = gun.reset_index()

# Columns holding the age data and the gender data
age_data = gun_reindex[['victim_ages', 'suspect_ages', 'victim_age_group', 'suspect_age_group']]
gender_data = gun_reindex[['victim_genders', 'suspect_genders']]

# Collect the genders of all victims in one flat list and count 'Male' / 'Female'
victim_genders = []
for genders_in_incident in gender_data['victim_genders']:
    victim_genders.extend(genders_in_incident)
victim_male_count = victim_genders.count('Male')
victim_female_count = victim_genders.count('Female')

# Same for the suspects
suspect_genders = []
for genders_in_incident in gender_data['suspect_genders']:
    suspect_genders.extend(genders_in_incident)
suspect_male_count = suspect_genders.count('Male')
suspect_female_count = suspect_genders.count('Female')


# Two pie charts side by side: gender ratio of victims (left) and suspects (right)
fig, axs = plt.subplots(1, 2)
fig.suptitle("Gender Ratio")
counts = [victim_male_count, victim_female_count]
counts2 = [suspect_male_count, suspect_female_count]
labels = ['Male', 'Female']
colors = ['slategrey', 'peachpuff']
pie_panels = (
    (counts, 'Gender Ratio of Victims'),
    (counts2, 'Gender Ratio of Suspects'),
)
for pie_axis, (pie_counts, pie_title) in zip(axs, pie_panels):
    pie_axis.pie(pie_counts, labels=labels, colors=colors, autopct="%1.1f%%")
    pie_axis.axis('equal')
    pie_axis.set_title(pie_title)

# Output: Text(0.5, 1.0, 'Gender Ratio of Suspects')


# Flatten the per-incident "victim_ages" lists into one list of all victim ages
all_victim_ages = [age for ages in age_data['victim_ages'] for age in ages]


# Convert the victim ages to integers, leaving out every entry
# that cannot be converted
new_victim_age_list = []

for raw_age in all_victim_ages:
    try:
        age_value = int(raw_age)
    except ValueError:
        continue
    new_victim_age_list.append(age_value)


# Flatten the per-incident "suspect_ages" lists into one list of all suspect ages
all_suspect_ages = [age for ages in age_data['suspect_ages'] for age in ages]


# Convert the suspect ages to integers, leaving out every entry
# that cannot be converted
new_suspect_age_list = []

for raw_age in all_suspect_ages:
    try:
        age_value = int(raw_age)
    except ValueError:
        continue
    new_suspect_age_list.append(age_value)


# Two histograms side by side: victim ages (left) and suspect ages (right)
plt.rcParams["figure.autolayout"] = True
fig, axs = plt.subplots(1, 2, figsize=(9, 4))
fig.suptitle('Distribution of Ages', fontsize=15)
x_lab = "Age"
y_lab = "Frequency"
histogram_panels = (
    (new_victim_age_list, 'orange', 'Distribution of Victim Ages'),
    (new_suspect_age_list, 'green', 'Distribution of Suspect Ages'),
)
for age_axis, (ages, bar_color, panel_title) in zip(axs, histogram_panels):
    age_axis.hist(ages, bins=100, width=1.5, color=bar_color)
    age_axis.set_title(panel_title, fontsize=10)
    age_axis.set_xlim([0, 80])
    age_axis.set_xlabel(x_lab)
    age_axis.set_ylabel(y_lab)


# Start from the cleaned dataframe gun_str.
# Here we only focus on the gun types for each state from 2013 to 2018,
# so only the date, state and gun_type variables are carried over into the
# new dataframe used in this analysis.
df = gun_str[['date', 'state', 'gun_type']].copy()

# now append the gun_type list for each state, and we get the new dataframe 

def append_lists(x):
    """Concatenate the cells of a group into one flat list.

    Args:
        x: Iterable of cells; each cell is a list of values or a single string.

    Returns:
        A new list with the items of every list cell, in order. A string cell
        is added as one item, except the text ``'nan'`` (what a missing value
        looks like after the cells were converted to strings), which is skipped.
    """
    result = []
    for cell in x:
        if isinstance(cell, str):
            # list.extend() on a string adds it one character at a time
            # ('nan' -> 'n', 'a', 'n'), so strings are handled as whole values.
            if cell != 'nan':
                result.append(cell)
        else:
            result.extend(cell)
    return result

# Collect the gun types per date *and* state. Grouping by date alone and
# keeping the "first" state would credit every gun reported on a date to
# whichever state happens to be listed first on that date.
# The result is indexed by date with the columns 'state' and 'gun_type'.
grouped_df = (df.groupby(['date', 'state'])
                .agg({'gun_type': append_lists})
                .reset_index(level='state'))

def filters(lst):
    """Remove every ':' from each string in ``lst``.

    Args:
        lst: List of strings.

    Returns:
        A new list of the same length and order with all ':' characters
        removed from every element.
    """
    # One result per element: removing ':' also removes '::'. Looping over
    # both symbols produced two results per element, which doubled the list
    # and left ':' behind in half of the copies.
    return [element.replace(':', '') for element in lst]
# Clean every gun_type list with filters()
grouped_df['gun_type'] = grouped_df['gun_type'].apply(filters)

# Entries to delete from the gun_type lists (Unknown, n, a, ...); what remains
# is the final dataset used below.
stopwords = ["Unknown", "a", "n",":Unknown","::Unknown","Handgun","U","K","O","o","w",":"]
def filter_gun(lst):
    """Return the elements of ``lst`` that are not listed in ``stopwords``, in order."""
    return list(filter(lambda gun_type: gun_type not in stopwords, lst))
# Drop the stop-word entries from every gun_type list
grouped_df['gun_type'] = grouped_df['gun_type'].apply(filter_gun)

# let's generate a new dataframe:
def append_rows(group):
    """Merge the 'gun_type' lists of all rows in ``group`` into one list.

    Returns a one-element Series whose 'gun_type_sum' entry is the merged list.
    """
    merged_gun_types = []
    for gun_types in group['gun_type'].tolist():
        merged_gun_types.extend(gun_types)
    return pd.Series({'gun_type_sum': merged_gun_types})

# Group by state, merge the gun_type lists of each state with append_rows,
# and reset the index so that 'state' becomes a regular column again.
result = grouped_df.groupby('state').apply(append_rows).reset_index()

# from here on, df is the per-state dataframe
df = result

def get_most_frequent(row):
    """Find the most frequent entry of a row's 'gun_type_sum' list.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with a 'gun_type_sum' list.

    Returns:
        A Series with 'most_frequent' (the most common value) and 'count'
        (how often it occurs). Both are 0 when the list is empty.
    """
    if len(row['gun_type_sum']) == 0:
        return pd.Series({'most_frequent': 0, 'count': 0})
    value_counts = pd.Series(row['gun_type_sum']).value_counts()
    # value_counts() is sorted by descending frequency, so the first entry is
    # the most frequent one. Read it by position with .iloc: the index holds
    # the values themselves as labels, and a bare [0] on such an index is a
    # deprecated positional fallback.
    most_frequent_value = value_counts.index[0]
    return pd.Series({'most_frequent': most_frequent_value, 'count': value_counts.iloc[0]})

# Add the most frequent gun type and its count to every state row
result = df.merge(df.apply(get_most_frequent, axis=1), left_index=True, right_index=True)


# Count how many states share each 'most_frequent' gun type
char = result[['most_frequent','count']]
category_counts = char['most_frequent'].value_counts()

# create a color map; pyplot.get_cmap is used because the older
# matplotlib.cm.get_cmap accessor is no longer available in current Matplotlib
color_map = plt.get_cmap('Blues')

# create a pie chart (drawn as a ring because of the wedge width)
fig, ax = plt.subplots()
ax.pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%',
       colors=color_map(np.linspace(0.3, 0.95, len(category_counts))), startangle=10,
       wedgeprops=dict(width=0.5, alpha=0.8), pctdistance=0.8)
ax.set_title('Most frequent gun types by state')
fig.tight_layout()
plt.show()


lemmatizer = nltk.WordNetLemmatizer()
def change_tag(tag):
    """
    FUNCTION: convert a POS tag into the corresponding WordNet POS tag,
    based on the first letter of the tag.
    For example, NNS, NNP, etc. all start with "N" and are treated as nouns.
    RETURN: the WordNet tag for nouns, verbs, adverbs or adjectives;
    any other tag defaults to a noun.
    """
    first_letter = tag[0]
    if first_letter == "N":
        return wordnet.NOUN
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    if first_letter == "J":
        return wordnet.ADJ
    return wordnet.NOUN  # Default to a noun.


# Join all incident descriptions into one text. The separating space matters:
# gluing the cells together directly fuses the last word of one incident with
# the first word of the next one (presumably the source of tokens such as
# 'injuredshot').
raw = " ".join(gun_reindex["incident_characteristics"])

# Tokenize, keep only alphanumeric tokens in lower case, and lemmatize each
# word according to its part-of-speech tag.
words = nltk.word_tokenize(raw)
words = [x.lower() for x in words if x.isalnum()]
words_tags = nltk.pos_tag(words)
words = [lemmatizer.lemmatize(w, change_tag(t)) for (w, t) in words_tags]
# Remove the English stop words plus a few words specific to this data set.
stopword_self = ["gun", "dead", "fire", "find"]
stopwords = nltk.corpus.stopwords.words("english") + stopword_self
# Test membership against a set: one lookup per word instead of a scan
# through the whole stop-word list.
stopword_set = set(stopwords)
words = [w for w in words if w not in stopword_set]


#fq=nltk.FreqDist(words)
#%matplotlib inline 
#fq.plot(50,cumulative=False)


# Build one text from the remaining words and generate the word cloud from it,
# leaving out the two listed words.
text = ' '.join(words)
stopwords = ['injuredshot', 'le']
cloud_builder = WordCloud(
    width=1500,
    height=1000,
    random_state=45,
    stopwords=stopwords,
    background_color="black",
    collocations=False,
)
wordcloud = cloud_builder.generate(text)

# display the word cloud
plt.figure(figsize=(15, 10))
plt.title("WordCloud for GV Report")
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()


# scraping from website
url = 'https://www.icip.iastate.edu/tables/employment/unemployment-states'
tables = pd.read_html(url)
# Use the second table on the page; drop the FIPS code, the year columns that
# are not needed, and the row labelled 0, then index the table by area name.
df = tables[1].drop(["FIPS", "1980", "1990", "2000", "2010", "2018"], axis=1).drop(0, axis = 0)
df = df.set_index("Area Name")
# Then we extract the number of total GV cases in 2017 and merge them into the dataframe.
# Both bounds are inclusive so that incidents on 1 January 2017 are counted too
# (a strict '>' on the start date leaves that day out).
start_date = '2017-01-01'
end_date = '2017-12-31'
mask = (gun_reindex['date'] >= start_date) & (gun_reindex['date'] <= end_date)
gv_2017 = gun_reindex.loc[mask]
# Incidents per state as a one-column frame indexed by state name. The names
# are set explicitly because the default column names after
# value_counts().reset_index() depend on the pandas version.
# NOTE(review): gun_reindex only holds the rows that passed the participant
# filters, not every incident - confirm this is the intended base for the counts.
count2017 = (gv_2017['state']
             .value_counts()
             .astype(float)
             .rename_axis('index')
             .to_frame(name='state'))
merged_df = pd.merge(df[['2017']], count2017[['state']], left_index=True, right_index=True)
merged_df.columns = ["Unemployment Rate 2017", "Gun Violence"]
merged_df = merged_df.astype(float)
# Correlation calculation
corr = merged_df["Unemployment Rate 2017"].corr(merged_df["Gun Violence"])


corr

# Output (recorded notebook run): 0.26200459307773666


# Scatter plot of the unemployment rate against the gun violence cases,
# with a fitted straight line (degree-1 polynomial) drawn on top.
x = merged_df["Unemployment Rate 2017"]
y = merged_df["Gun Violence"]
a, b = np.polyfit(x, y, 1)
scatter_axis = plt.gca()
scatter_axis.scatter(x, y, alpha=0.5)
scatter_axis.plot(x, a*x+b, color="pink")
scatter_axis.set_xlabel("Unemployment Rate 2017 (%)")
scatter_axis.set_ylabel("Gun Violence Cases")
scatter_axis.set_title("Scatter Plot of Unemployment Rate 2017 vs Gun Violence")

# Output: Text(0.5, 1.0, 'Scatter Plot of Unemployment Rate 2017 vs Gun Violence')

	date	state	city_or_county	address	n_killed	n_injured	congressional_district	gun_stolen	gun_type	incident_characteristics	...	notes	participant_age	participant_age_group	participant_gender	participant_relationship	participant_type	state_house_district	state_senate_district	Number_of_Victim	Number_of_suspect
0	2013-01-01	Pennsylvania	Mckeesport	1506 Versailles Avenue and Coursin Street	0	4	14.0	nan	nan	Shot - Wounded/Injured\|\|Mass Shooting (4+ victims injured or killed excluding the subject/suspect/perpetrator, one location)\|\|Possession (gun(s) found during commission of other crimes)\|\|Possession of gun by felon or prohibited person	...	Julian Sims under investigation: Four Shot and Injured	[20]	[Adult 18+, Adult 18+, Adult 18+, Adult 18+, Adult 18+]	[Male, Male, Male, Female]	nan	[Victim, Victim, Victim, Victim, Subject-Suspect]	nan	nan	4	1
1	2013-01-01	California	Hawthorne	13500 block of Cerise Avenue	1	3	43.0	nan	nan	Shot - Wounded/Injured\|\|Shot - Dead (murder, accidental, suicide)\|\|Mass Shooting (4+ victims injured or killed excluding the subject/suspect/perpetrator, one location)\|\|Gang involvement	...	Four Shot; One Killed; Unidentified shooter in getaway car	[20]	[Adult 18+, Adult 18+, Adult 18+, Adult 18+]	[Male]	nan	[Victim, Victim, Victim, Victim, Subject-Suspect]	62.0	35.0	4	1
2	2013-01-01	Ohio	Lorain	1776 East 28th Street	1	3	9.0	[Unknown, Unknown]	[Unknown, Unknown]	Shot - Wounded/Injured\|\|Shot - Dead (murder, accidental, suicide)\|\|Shots Fired - No Injuries\|\|Bar/club incident - in or around establishment	...	nan	[25, 31, 33, 34, 33]	[Adult 18+, Adult 18+, Adult 18+, Adult 18+, Adult 18+]	[Male, Male, Male, Male, Male]	nan	[Subject-Suspect, Subject-Suspect, Victim, Victim, Victim]	56.0	13.0	4	1
3	2013-01-05	Colorado	Aurora	16000 block of East Ithaca Place	4	0	6.0	nan	nan	Shot - Dead (murder, accidental, suicide)\|\|Officer Involved Incident\|\|Officer Involved Shooting - subject/suspect/perpetrator killed\|\|Drug involvement\|\|Kidnapping/abductions/hostage\|\|Under the influence of alcohol or drugs (only applies to the subject/suspect/perpetrator )	...	nan	[29, 33, 56, 33]	[Adult 18+, Adult 18+, Adult 18+, Adult 18+]	[Female, Male, Male, Male]	nan	[Victim, Victim, Victim, Subject-Suspect]	40.0	28.0	4	1
4	2013-01-07	North Carolina	Greensboro	307 Mourning Dove Terrace	2	2	6.0	[Unknown, Unknown]	[Handgun, Handgun]	Shot - Wounded/Injured\|\|Shot - Dead (murder, accidental, suicide)\|\|Suicide^\|\|Murder/Suicide\|\|Attempted Murder/Suicide (one variable unsuccessful)\|\|Domestic Violence	...	Two firearms recovered. (Attempted) murder suicide - both succeeded in fulfilling an M/S and did not succeed, based on details.	[18, 46, 14, 47]	[Adult 18+, Adult 18+, Teen 12-17, Adult 18+]	[Female, Male, Male, Female]	[Family]	[Victim, Victim, Victim, Subject-Suspect]	62.0	27.0	4	1

	victim_ages	suspect_ages	victim_age_group	suspect_age_group	victim_genders	suspect_genders
2	[33, 34, 33]	[25, 31]	[Adult 18+, Adult 18+, Adult 18+]	[Adult 18+, Adult 18+]	[Male, Male, Male]	[Male, Male]
3	[29, 33, 56]	[33]	[Adult 18+, Adult 18+, Adult 18+]	[Adult 18+]	[Female, Male, Male]	[Male]
4	[18, 46, 14]	[47]	[Adult 18+, Adult 18+, Teen 12-17]	[Adult 18+]	[Female, Male, Male]	[Female]
6	[51, 40, 9, 5, 2]	[15]	[Adult 18+, Adult 18+, Child 0-11, Child 0-11, Child 0-11]	[Teen 12-17]	[Male, Female, Male, Female, Female]	[Male]
14	[34, 28, 23, 29]	[29]	[Adult 18+, Adult 18+, Adult 18+, Adult 18+]	[Adult 18+]	[Male, Male, Male, Male]	[Male]
18	[18, 22, 21, 29]	[19, 22, 23]	[Adult 18+, Adult 18+, Adult 18+, Adult 18+]	[Adult 18+, Adult 18+, Adult 18+]	[Male, Female, Female, Male]	[Male, Male, Male]
23	[18, 18, 18, 19]	[41]	[Adult 18+, Adult 18+, Adult 18+, Adult 18+]	[Adult 18+]	[Male, Male, Male, Female]	[Male]
27	[18, 18, 18, 19]	[15, 17]	[Adult 18+, Adult 18+, Adult 18+, Adult 18+]	[Teen 12-17, Teen 12-17]	[Male, Male, Male, Male]	[Male, Male]
29	[23, 34, 17, 25]	[]	[Adult 18+, Adult 18+, Teen 12-17, Adult 18+]	[]	[Male, Male, Female, Female]	[]
32	[33, 28, 29, 21]	[25]	[Adult 18+, Adult 18+, Adult 18+, Adult 18+]	[Adult 18+]	[Male, Female, Male, Male]	[Male]

STA141B Final Project: Gun Violence Analysis¶

Group1: Haitong Zhu, Mingyu Zhu, Kaizhong Mu, Yibo Li¶

I. Introduction¶

Goal¶

II. Dataset¶

a. Data Description¶

b. Data Cleaning¶

III. Data Exploring¶

a. Distribution of Gun Violence by time¶

By Year¶

By Month¶

b. Distribution of Gun Violence by location¶

c. Common Characteristics in Gun Violence Cases¶

Gender¶

Age¶

d. Most Used Weapons in Crime¶

e. Why People Commit Gun Violence¶

f. Relationship between Unemployment Rate and GV cases¶

IV. Conclusion¶

V. Reference¶