In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:

df = pd.read_csv('california_cities.csv')
df.head()

Out[2]:

	city	latd	longd	elevation_m	elevation_ft	population_total	area_total_sq_mi	area_land_sq_mi	area_water_sq_mi	area_total_km2	area_land_km2	area_water_km2	area_water_percent
0	Adelanto	34.576111	-117.432778	875.0	2871.0	31765.0	56.027	56.009	0.018	145.107	145.062	0.046	0.03
1	AgouraHills	34.153333	-118.761667	281.0	922.0	20330.0	7.822	7.793	0.029	20.260	20.184	0.076	0.37
2	Alameda	37.756111	-122.274444	NaN	33.0	75467.0	22.960	10.611	12.349	59.465	27.482	31.983	NaN
3	Albany	37.886944	-122.297778	NaN	43.0	18969.0	5.465	1.788	3.677	14.155	4.632	9.524	NaN
4	Alhambra	34.081944	-118.135000	150.0	492.0	83089.0	7.632	7.631	0.001	19.766	19.763	0.003	NaN

In [3]:

df.columns

Out[3]:

Index(['city', 'latd', 'longd', 'elevation_m', 'elevation_ft',
       'population_total', 'area_total_sq_mi', 'area_land_sq_mi',
       'area_water_sq_mi', 'area_total_km2', 'area_land_km2', 'area_water_km2',
       'area_water_percent'],
      dtype='object')

Lets Explore¶

Part - I Idetification and Clearning Missing Values:¶

1. Find out the missing values in each column and store them in the variable col_missing_values

In [4]:

col_missing_values = df.isna().sum()

In [ ]:

2. Which Column has the most number of missing values?

In [ ]:

# you can check from the result of question 1 above

In [6]:

df.isnull().sum()

Out[6]:

city                   0
latd                  12
longd                 15
elevation_m           50
elevation_ft          14
population_total       2
area_total_sq_mi       6
area_land_sq_mi        3
area_water_sq_mi       5
area_total_km2         7
area_land_km2          6
area_water_km2         6
area_water_percent    62
dtype: int64

3. Which Column has the minimum missing values?

In [ ]:

df.isnull().sum()

4. How many values of the column elevation_m are missing?

In [ ]:

5. Find the total missing values in the whole dataset and store the number in df_missing_values

In [7]:

df_missing_values = df.isnull().sum().sum()

In [ ]:

6. Drop the column area_water_percent as it has the most of its values missing
Note: Drop it permanently by using inplace=True

In [9]:

df.drop('area_water_percent', axis=1, inplace=True)

In [ ]:

7. Drop the rows having missing values and store your resulting DataFrame in the variable df_narows_dropped

In [11]:

df_narows_dropped = df.dropna(axis = 0, inplace=False)

In [16]:

df.shape[1]

Out[16]:

In [28]:

df.value_counts('city', sort= False)

Out[28]:

city
Adelanto       1
AgouraHills    1
Alameda        1
Albany         1
Alhambra       1
              ..
Yountville     1
Yreka          1
YubaCity       1
Yucaipa        1
YuccaValley    1
Name: count, Length: 482, dtype: int64

8. Drop Rows with More Than 5 Missing Values

In [17]:

df_rows_dropped = df.dropna(thresh = df.shape[1]-5)

In [ ]:

9. Remove Columns with Missing Values

In [29]:

df_nacols_dropped = df.dropna(axis=1)

In [ ]:

10. Drop colomns with more than 10 missing values and store the resulting DataFrame in the variable df_cols_dropped

In [31]:

df_cols_dropped = df.dropna(thresh = df.shape[0] - 10 ,axis=1)

In [37]:

# count the missing values in each column and store it in missing_values
missing_valuees = df.isna().sum()

missing_valuees <= 10
# # create a new DataFrame containing columns that have less than or equal to 10 missing values
# df_cols_dropped = df.loc[:, missing_valuees <= 10]
# df_cols_dropped

Out[37]:

city                 True
latd                False
longd               False
elevation_m         False
elevation_ft        False
population_total     True
area_total_sq_mi     True
area_land_sq_mi      True
area_water_sq_mi     True
area_total_km2       True
area_land_km2        True
area_water_km2       True
dtype: bool

Part - II Data Imputation Methods:¶

In [38]:

df.isna().sum()

Out[38]:

city                 0
latd                12
longd               15
elevation_m         50
elevation_ft        14
population_total     2
area_total_sq_mi     6
area_land_sq_mi      3
area_water_sq_mi     5
area_total_km2       7
area_land_km2        6
area_water_km2       6
dtype: int64

11. Fill the 50 missing values in elevation_m with -999. Store your result in the variable filled_elevation_m

In [42]:

filled_elevation_m = df['elevation_m'].fillna(-999,inplace=False)

In [ ]:

12. Fill the 7 missing values in area_total_km2 with the value 0 permanently, store your result in the variable filled_area_total

In [47]:

df['area_total_km2'].fillna(0, inplace=True)
filled_area_total = df['area_total_km2']

In [52]:

df['latd'].isna()

Out[52]:

0      False
1      False
2      False
3      False
4      False
       ...  
477    False
478    False
479    False
480    False
481    False
Name: latd, Length: 482, dtype: bool

13. Fill the missing values of the column latd using backward filling method and store your result in the variable bfill_latd

In [53]:

bfill_latd = df['latd'].bfill()

In [58]:

df.isnull().sum()

Out[58]:

city                 0
latd                12
longd               15
elevation_m         50
elevation_ft        14
population_total     2
area_total_sq_mi     6
area_land_sq_mi      3
area_water_sq_mi     5
area_total_km2       0
area_land_km2        6
area_water_km2       6
dtype: int64

14. Fill the 15 missing values of the column longd using forwarding filling method and store your result in the variable ffill_longd

In [55]:

ffill_longd = df['longd'].ffill()

In [ ]:

15. Fill the 2 missing values of the column population_total with the mean of the column and store your result in the variable mean_total_population

In [61]:

mean_popu = df['population_total'].mean()
mean_total_population = df['population_total'].fillna(value = mean_popu)

In [ ]:

16. Fill the 5 missing values of the column area_water_sq_mi with the median value of the column and store your result in the variable median_fill

In [66]:

# find the median value of the column
median_value = df['area_water_sq_mi'].median()

# fill the missing values with the median value and store the result in a new dataframe
median_fill = df['area_water_sq_mi'].fillna(median_value)

In [ ]:

17. Fill the 6 missing values of the column area_land_km2 with the mode value of the column and store your result in the variable mode_fill

In [68]:

mode_fill = df['area_land_km2'].fillna(df['area_land_km2'].mode()[0], inplace=False)

In [ ]:

18. Which of the following code will fill the missing values in the DataFrame with zeros and store your result in a variable filled_df

In [ ]:

19. Which of the following is/are the general structure for filling a column values with any value (mean, mode, or median)

In [ ]:

Statement of Completion#4b750cd1

Data Cleaning with Pandas

Practice cleaning missing values with California Cities report

Lets Explore¶

Part - I Idetification and Clearning Missing Values:¶

Part - II Data Imputation Methods:¶