In [63]:

import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno

In [64]:

df = pd.read_csv('googleplaystore.csv')
df.sample(5)

Out[64]:

	App	Category	Rating	Reviews	Size	Installs	Type	Price	Content Rating	Genres	Last Updated	Current Ver	Android Ver
10682	Fruit Ninja Classic	GAME	4.3	85468	36M	1,000,000+	Paid	$0.99	Everyone	Arcade	8-Jun-18	2.4.1.485300	4.0.3 and up
731	English words application mikan	EDUCATION	4.7	9888	Varies with device	500,000+	Free	0	Everyone	Education	30-Jul-18	Varies with device	4.1 and up
7256	Wallpapers DAF CF Trucks	PERSONALIZATION	NaN	1	13M	100+	Free	0	Teen	Personalization	13-Jun-16	1	2.3.3 and up
872	Cinematic Cinematic	ENTERTAINMENT	4.4	37000	15M	1,000,000+	Free	0	Mature 17+	Entertainment	21-Jun-18	4.0.5	4.1 and up
8643	Wunderlist: To-Do List & Tasks	PRODUCTIVITY	4.6	404610	Varies with device	10,000,000+	Free	0	Everyone	Productivity	6-Apr-18	Varies with device	Varies with device

Data Cleaning¶

1. Which of the following column(s) has/have null values?¶

In [65]:

msno.bar(df)

Out[65]:

<Axes: >

No description has been provided for this image

In [66]:

df.info() #we can't get much from the bar plot so we use numerical info instead
#reviews have also some issues in fact when we print the data frame it only shows numbers but the type of the column is object

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB

In [67]:

df.isnull().sum().sort_values(ascending=False)

Out[67]:

Rating            1474
Current Ver          8
Android Ver          3
Type                 1
Content Rating       1
App                  0
Category             0
Reviews              0
Size                 0
Installs             0
Price                0
Genres               0
Last Updated         0
dtype: int64

In [68]:

df.isnull().any()

Out[68]:

App               False
Category          False
Rating             True
Reviews           False
Size              False
Installs          False
Type               True
Price             False
Content Rating     True
Genres            False
Last Updated      False
Current Ver        True
Android Ver        True
dtype: bool

2. Clean the `Rating` column and the other columns containing null values¶

In [69]:

df['Rating'].plot(kind = 'hist') #there are some rating above 5 starts which is incorrect we gonna deal with that

Out[69]:

<Axes: ylabel='Frequency'>

In [70]:

df.loc[df['Rating'] > 5 , 'Rating' ] = np.nan

In [71]:

df.loc[df['Rating'] > 5 ] #fantastic question 1 done

Out[71]:

	App	Category	Rating	Reviews	Size	Installs	Type	Price	Content Rating	Genres	Last Updated	Current Ver	Android Ver

In [72]:

df['Rating'].describe()

Out[72]:

count    9326.000000
mean        4.197727
std         0.508178
min         1.000000
25%         4.000000
50%         4.300000
75%         4.500000
max         5.000000
Name: Rating, dtype: float64

In [73]:

df['Rating'] = df['Rating'].fillna(df['Rating'].mean())

In [74]:

df['Rating'].isnull().sum() # question 2 done

Out[74]:

In [75]:

df.dropna(inplace = True) #question 3 done

In [76]:

df.shape

Out[76]:

(10829, 13)

3. Clean the column `Reviews` and make it numeric¶

In [77]:

df['Reviews'].dtypes

Out[77]:

dtype('O')

In [78]:

# let's try to parse the review column to numeric but with a little argument that instead of raising an error when it's not
#possible to parse to numeric , it's just assigned a NaN to it 

#let's see where are the errors

df_reviews_errors = df.copy()
df_reviews_errors ['Reviews Errors'] = pd.to_numeric(df['Reviews'] , errors = 'coerce')
df_reviews_errors.loc[df_reviews_errors['Reviews Errors'].isna()]

Out[78]:

	App	Category	Rating	Reviews	Size	Installs	Type	Content Rating	Genres	Last Updated	Current Ver	Android Ver	Reviews Errors
72	Android Auto - Maps, Media, Messaging & Voice	AUTO_AND_VEHICLES	4.2	2M	16M	10,000,000+	Free	Teen	Auto & Vehicles	11-Jul-18	Varies with device	5.0 and up	NaN
1778	Block Craft 3D: Building Simulator Games For Free	GAME	4.5	1M	57M	50,000,000+	Free	Everyone	Simulation	5-Mar-18	2.10.2	4.0.3 and up	NaN
1781	Trivia Crack	GAME	4.5	6.4M	95M	100,000,000+	Free	Everyone	Trivia	3-Aug-18	2.79.0	4.1 and up	NaN

In [79]:

M_reviews  = pd.to_numeric(df.loc[df['Reviews'].str.contains('M') , 'Reviews'].str.replace('M' , '') ) * 1000000
K_reviews  = pd.to_numeric(df.loc[df['Reviews'].str.contains('K') , 'Reviews'].str.replace('K' , '') ) * 1000

In [80]:

M_reviews

Out[80]:

72      2000000.0
1778    1000000.0
1781    6400000.0
Name: Reviews, dtype: float64

In [81]:

K_reviews

Out[81]:

Series([], Name: Reviews, dtype: int64)

In [82]:

df.loc[df['Reviews'].str.contains('M') , 'Reviews'] = M_reviews.astype(str)

In [83]:

df.iloc[71] #Perfect

Out[83]:

App               Android Auto - Maps, Media, Messaging & Voice
Category                                      AUTO_AND_VEHICLES
Rating                                                      4.2
Reviews                                               2000000.0
Size                                                        16M
Installs                                            10,000,000+
Type                                                       Free
Price                                                         0
Content Rating                                             Teen
Genres                                          Auto & Vehicles
Last Updated                                          11-Jul-18
Current Ver                                  Varies with device
Android Ver                                          5.0 and up
Name: 72, dtype: object

In [84]:

#now let's make the column numerical
df['Reviews'] = pd.to_numeric(df['Reviews'])

In [85]:

df['Reviews'].dtypes

Out[85]:

dtype('float64')

4. How many duplicated apps are there?¶

In [86]:

df['App'].duplicated().any()

Out[86]:

True

In [87]:

#quick view of duplicates
df.loc[df['App'].duplicated(keep =False)].sort_values('App').head() # sort values to make the duplicates appear next to each others

Out[87]:

	App	Category	Rating	Reviews	Size	Installs	Type	Price	Content Rating	Genres	Last Updated	Current Ver	Android Ver
1393	10 Best Foods for You	HEALTH_AND_FITNESS	4.0	2490.0	3.8M	500,000+	Free	0	Everyone 10+	Health & Fitness	17-Feb-17	1.9	2.3.3 and up
1407	10 Best Foods for You	HEALTH_AND_FITNESS	4.0	2490.0	3.8M	500,000+	Free	0	Everyone 10+	Health & Fitness	17-Feb-17	1.9	2.3.3 and up
2543	1800 Contacts - Lens Store	MEDICAL	4.7	23160.0	26M	1,000,000+	Free	0	Everyone	Medical	27-Jul-18	7.4.1	5.0 and up
2322	1800 Contacts - Lens Store	MEDICAL	4.7	23160.0	26M	1,000,000+	Free	0	Everyone	Medical	27-Jul-18	7.4.1	5.0 and up
2385	2017 EMRA Antibiotic Guide	MEDICAL	4.4	12.0	3.8M	1,000+	Paid	$16.99	Everyone	Medical	27-Jan-17	1.0.5	4.0.3 and up

In [88]:

df['App'].duplicated(keep =False).sum()

#let's explain some things : by default pandas do the following :

# Twitter
# Twitter X (this is the duplicate)

# Facebook
# Facebook X 
# Facebook X
# Facebook X
# Facebook X

#means there are 5 duplicates (takes the original and count the rest as duplicates)
#but in this activity it's asked to count duplicates with the original one , so we enter the argument keep = False

Out[88]:

5. Drop duplicated apps keeping the ones with the greatest number of reviews¶

In [89]:

df = df.sort_values(['App' , 'Reviews'])
df = df.drop_duplicates(subset = ['App'], keep='last') #if ur wondering why i sorted items ascending and deleted the last item
#basically the activity checks it when the indexes are in this order , stupid yes !

In [90]:

df

Out[90]:

	App	Category	Rating	Reviews	Size	Installs	Type	Price	Content Rating	Genres	Last Updated	Current Ver	Android Ver
8884	"i DT" Fútbol. Todos Somos Técnicos.	SPORTS	4.197727	27.0	3.6M	500+	Free	0	Everyone	Sports	7-Oct-17	0.22	4.1 and up
324	#NAME?	COMICS	3.500000	115.0	9.1M	10,000+	Free	0	Mature 17+	Comics	13-Jul-18	5.0.12	5.0 and up
8532	+Download 4 Instagram Twitter	SOCIAL	4.500000	40467.0	22M	1,000,000+	Free	0	Everyone	Social	2-Aug-18	5.03	4.1 and up
4541	.R	TOOLS	4.500000	259.0	203k	10,000+	Free	0	Everyone	Tools	16-Sep-14	1.1.06	1.5 and up
4636	/u/app	COMMUNICATION	4.700000	573.0	53M	10,000+	Free	0	Mature 17+	Communication	3-Jul-18	4.2.4	4.1 and up
...	...	...	...	...	...	...	...	...	...	...	...	...	...
6334	뽕티비 - 개인방송, 인터넷방송, BJ방송	VIDEO_PLAYERS	4.197727	414.0	59M	100,000+	Free	0	Mature 17+	Video Players & Editors	18-Jul-18	4.0.7	4.0.3 and up
4362	💎 I'm rich	LIFESTYLE	3.800000	718.0	26M	10,000+	Paid	$399.99	Everyone	Lifestyle	11-Mar-18	1.0.0	4.4 and up
2575	💘 WhatsLov: Smileys of love, stickers and GIF	SOCIAL	4.600000	22098.0	18M	1,000,000+	Free	0	Everyone	Social	24-Jul-18	4.2.4	4.0.3 and up
7559	📏 Smart Ruler ↔️ cm/inch measuring for homework!	TOOLS	4.000000	19.0	3.2M	10,000+	Free	0	Everyone	Tools	21-Oct-17	1	4.2 and up
882	🔥 Football Wallpapers 4K \| Full HD Backgrounds 😍	ENTERTAINMENT	4.700000	11661.0	4.0M	1,000,000+	Free	0	Everyone	Entertainment	14-Jul-18	1.1.3.2	4.0.3 and up

9648 rows × 13 columns

In [91]:

df.loc[df['App'].duplicated(keep =False)].sort_values('App').head()

Out[91]:

	App	Category	Rating	Reviews	Size	Installs	Type	Price	Content Rating	Genres	Last Updated	Current Ver	Android Ver

6. Format the `Category` column¶

In [92]:

df.head()

Out[92]:

	App	Category	Rating	Reviews	Size	Installs	Type	Content Rating	Genres	Last Updated	Current Ver	Android Ver
8884	"i DT" Fútbol. Todos Somos Técnicos.	SPORTS	4.197727	27.0	3.6M	500+	Free	Everyone	Sports	7-Oct-17	0.22	4.1 and up
324	#NAME?	COMICS	3.500000	115.0	9.1M	10,000+	Free	Mature 17+	Comics	13-Jul-18	5.0.12	5.0 and up
8532	+Download 4 Instagram Twitter	SOCIAL	4.500000	40467.0	22M	1,000,000+	Free	Everyone	Social	2-Aug-18	5.03	4.1 and up
4541	.R	TOOLS	4.500000	259.0	203k	10,000+	Free	Everyone	Tools	16-Sep-14	1.1.06	1.5 and up
4636	/u/app	COMMUNICATION	4.700000	573.0	53M	10,000+	Free	Mature 17+	Communication	3-Jul-18	4.2.4	4.1 and up

In [93]:

df['Category'].value_counts() #doesn't seem to be wrong values , everything's good !

Out[93]:

Category
FAMILY                 1874
GAME                    945
TOOLS                   827
BUSINESS                420
MEDICAL                 395
PRODUCTIVITY            374
PERSONALIZATION         374
LIFESTYLE               369
FINANCE                 345
SPORTS                  325
COMMUNICATION           315
HEALTH_AND_FITNESS      288
PHOTOGRAPHY             281
NEWS_AND_MAGAZINES      254
SOCIAL                  239
BOOKS_AND_REFERENCE     221
TRAVEL_AND_LOCAL        219
SHOPPING                202
DATING                  170
VIDEO_PLAYERS           164
MAPS_AND_NAVIGATION     131
FOOD_AND_DRINK          112
EDUCATION               105
ENTERTAINMENT            86
AUTO_AND_VEHICLES        85
LIBRARIES_AND_DEMO       83
WEATHER                  79
HOUSE_AND_HOME           73
EVENTS                   64
ART_AND_DESIGN           60
PARENTING                60
COMICS                   56
BEAUTY                   53
Name: count, dtype: int64

In [94]:

df['Category'] = df['Category'].str.replace('_' , ' ' )


df['Category'] = df['Category'].apply(lambda x : x.lower().capitalize()) #lambda the best

In [95]:

df['Category'].value_counts()

Out[95]:

Category
Family                 1874
Game                    945
Tools                   827
Business                420
Medical                 395
Productivity            374
Personalization         374
Lifestyle               369
Finance                 345
Sports                  325
Communication           315
Health and fitness      288
Photography             281
News and magazines      254
Social                  239
Books and reference     221
Travel and local        219
Shopping                202
Dating                  170
Video players           164
Maps and navigation     131
Food and drink          112
Education               105
Entertainment            86
Auto and vehicles        85
Libraries and demo       83
Weather                  79
House and home           73
Events                   64
Art and design           60
Parenting                60
Comics                   56
Beauty                   53
Name: count, dtype: int64

7. Clean and convert the `Installs` column to numeric type¶

In [96]:

df.sample(5)

Out[96]:

	App	Category	Rating	Reviews	Size	Installs	Type	Price	Content Rating	Genres	Last Updated	Current Ver	Android Ver
2740	Nordstrom	Shopping	4.6	2278.0	12M	500,000+	Free	0	Everyone	Shopping	15-Jul-18	3.28.1.3.1359	4.4 and up
571	Moco+ - Chat, Meet People	Dating	4.2	1546.0	Varies with device	10,000+	Paid	$3.99	Mature 17+	Dating	19-Jun-18	2.6.139	4.1 and up
10792	Soccer Clubs Logo Quiz	Game	4.2	21661.0	16M	1,000,000+	Free	0	Everyone	Trivia	24-May-18	1.3.81	4.0 and up
6477	BM Pharmacy	Health and fitness	4.9	42.0	18M	1,000+	Free	0	Everyone	Health & Fitness	11-Jun-18	1.5.12	4.1 and up
4428	WiFi-o-Matic	Productivity	4.0	736.0	6.8M	50,000+	Free	0	Everyone	Productivity	1-Feb-18	1.80.03	4.1 and up

In [97]:

df['Installs'] =df['Installs'].str.replace('[^0-9]' , '' ,regex=True)
df.sample(5)

Out[97]:

	App	Category	Rating	Reviews	Size	Installs	Type	Content Rating	Genres	Last Updated	Current Ver	Android Ver
7768	Chest Simulator for Clash Royale	Family	4.4	4756.0	33M	100000	Free	Everyone	Simulation	19-Jul-18	1.1.9	4.1 and up
4375	Libre Scan (Diabetes:M addon)	Medical	2.4	102.0	1.2M	10000	Free	Everyone	Medical	9-Jul-17	1.2	4.4 and up
10225	Video Download For FB	Tools	4.3	444.0	1.6M	100000	Free	Everyone	Tools	14-Jul-17	1	3.0 and up
5642	Two Nights at jumpscare	Game	3.8	596.0	91M	100000	Free	Teen	Adventure	17-Aug-17	1	4.1 and up
813	Next Gen Science Standards	Education	4.3	206.0	18M	50000	Free	Everyone	Education	20-Dec-16	1.12.1	2.3 and up

In [98]:

df['Installs'] = pd.to_numeric(df['Installs'])

In [99]:

df['Installs'].dtypes

Out[99]:

dtype('int64')

8. Clean and convert the `Size` column to numeric (representing bytes)¶

In [100]:

df.sample()

Out[100]:

	App	Category	Rating	Reviews	Size	Installs	Type	Price	Content Rating	Genres	Last Updated	Current Ver	Android Ver
7551	Inch/cm/Foot Conversion	Productivity	4.0	319.0	2.1M	100000	Free	0	Everyone	Productivity	20-Aug-17	2.0.1	2.3.3 and up

In [101]:

df['Size'] =df['Size'].str.replace('[^a-zA-Z0-9.]' , '' ,regex=True)

In [102]:

M_size  = pd.to_numeric(df.loc[df['Size'].str.contains('M') , 'Size'].str.replace('M' , '') ) * 1_048_576
K_size  = pd.to_numeric(df.loc[df['Size'].str.contains('k') , 'Size'].str.replace('k' , '') ) * 1024

In [103]:

M_size.sample()

Out[103]:

3107    60817408.0
Name: Size, dtype: float64

In [104]:

K_size.sample()

Out[104]:

8218    110592.0
Name: Size, dtype: float64

In [105]:

df.loc[df['Size'].str.contains('M') , 'Size'] = M_size.astype(str)
df.loc[df['Size'].str.contains('k') , 'Size'] = K_size.astype(str)

In [106]:

df.loc[df['Size'].str.contains('[^0-9.]') , 'Size'].head()

Out[106]:

7338    Varieswithdevice
7330    Varieswithdevice
3448    Varieswithdevice
3151    Varieswithdevice
4875    Varieswithdevice
Name: Size, dtype: object

In [107]:

df['Size'] = df['Size'].str.replace('Varieswithdevice' , '0')

In [108]:

df.loc[df['Size'].str.contains('[^0-9.]') , 'Size'].head()

Out[108]:

Series([], Name: Size, dtype: object)

In [109]:

df['Size'] = df['Size'].astype('float')

In [110]:

df['Size'].dtypes

Out[110]:

dtype('float64')

9. Clean and convert the `Price` column to numeric¶

In [111]:

df.loc[df['Type'] == 'Paid'].sample()

Out[111]:

	App	Category	Rating	Reviews	Size	Installs	Type	Price	Content Rating	Genres	Last Updated	Current Ver	Android Ver
6132	BG Monitor Diabetes Pro	Medical	4.6	87.0	2936012.8	500	Paid	$5.99	Everyone	Medical	23-Apr-17	8.0.1	4.4 and up

In [112]:

df['Price'] = df['Price'].str.replace('$' , '')
df['Price'] = df['Price'].str.replace('Free' , '0')

In [113]:

df['Price'] = df['Price'].astype('float')

In [114]:

df.sample(5)

Out[114]:

	App	Category	Rating	Reviews	Size	Installs	Type	Price	Content Rating	Genres	Last Updated	Current Ver	Android Ver
465	imo free video calls and chat	Communication	4.300000	4785988.0	11534336.0	500000000	Free	0.00	Everyone	Communication	8-Jun-18	9.8.000000010501	4.0 and up
9827	ET Telecom from Economic Times	News and magazines	4.700000	273.0	6606028.8	10000	Free	0.00	Everyone	News & Magazines	7-Aug-18	2.9.0	4.1 and up
8571	Interactive NPC DM Tool	Family	4.197727	5.0	644096.0	50	Paid	0.99	Everyone	Role Playing	31-Jan-15	1.0.0	2.3.3 and up
7085	CA Mobile OTP	Tools	3.300000	688.0	5767168.0	100000	Free	0.00	Everyone	Tools	5-Sep-17	2.3.5	2.3.3 and up
1171	Citi Mobile®	Finance	4.000000	78306.0	48234496.0	5000000	Free	0.00	Everyone	Finance	31-Jul-18	9.9.0	4.4 and up

10. Paid or free?¶

In [115]:

df['Distribution'] = df['Type'].copy()
#stupid question , could've done it with lambda which iterate every value and assign paid or free based on a if statement

In [116]:

df.sample()

Out[116]:

	App	Category	Rating	Reviews	Size	Installs	Type	Price	Content Rating	Genres	Last Updated	Current Ver	Android Ver	Distribution
3241	Moto Voice	Tools	4.1	33216.0	0.0	10000000	Free	0.0	Everyone	Tools	5-Jun-18	Varies with device	Varies with device	Free

Analysis¶

11. Which app has the most reviews?¶

In [117]:

df.sort_values('Reviews' , ascending=False)

Out[117]:

	App	Category	Rating	Reviews	Size	Installs	Type	Price	Content Rating	Genres	Last Updated	Current Ver	Android Ver	Distribution
2544	Facebook	Social	4.100000	78158306.0	0.0	1000000000	Free	0.0	Teen	Social	3-Aug-18	Varies with device	Varies with device	Free
381	WhatsApp Messenger	Communication	4.400000	69119316.0	0.0	1000000000	Free	0.0	Everyone	Communication	3-Aug-18	Varies with device	Varies with device	Free
2604	Instagram	Social	4.500000	66577446.0	0.0	1000000000	Free	0.0	Teen	Social	31-Jul-18	Varies with device	Varies with device	Free
382	Messenger – Text and Video Chat for Free	Communication	4.000000	56646578.0	0.0	1000000000	Free	0.0	Everyone	Communication	1-Aug-18	Varies with device	Varies with device	Free
1879	Clash of Clans	Game	4.600000	44893888.0	102760448.0	100000000	Free	0.0	Everyone 10+	Strategy	15-Jul-18	10.322.16	4.1 and up	Free
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
5229	AJ+ Beta	News and magazines	4.197727	0.0	0.0	1000	Free	0.0	Everyone	News & Magazines	4-May-17	Varies with device	Varies with device	Free
5255	AJ and Alyssa	Lifestyle	4.197727	0.0	486400.0	100	Free	0.0	Everyone	Lifestyle	1-Jun-17	1.0.0	2.3.3 and up	Free
10692	MARKET FO	Communication	4.197727	0.0	15728640.0	100	Free	0.0	Everyone	Communication	30-Nov-17	1	4.1 and up	Free
5270	AJ Wallpapers	Personalization	4.197727	0.0	4089446.4	100	Free	0.0	Everyone	Personalization	6-May-18	2	4.0.3 and up	Free
9139	DZ Register	Productivity	4.197727	0.0	18874368.0	1	Free	0.0	Everyone	Productivity	31-Jul-18	1.0.6	4.1 and up	Free

9648 rows × 14 columns

12. What category has the highest number of apps uploaded to the store?¶

In [119]:

df['Category'].value_counts().sort_values(ascending=False).head(1)

Out[119]:

Category
Family    1874
Name: count, dtype: int64

13. To which category belongs the most expensive app?¶

In [125]:

df.sort_values('Price' , ascending=False )['Category'].head(1)

Out[125]:

4367    Lifestyle
Name: Category, dtype: object

14. What's the name of the most expensive game?¶

In [126]:

df.loc[df['Category'] == 'Game'].sort_values('Price' , ascending=False )['App'].head(1)

Out[126]:

4203    The World Ends With You
Name: App, dtype: object

15. Which is the most popular Finance App?¶

In [127]:

df.head()

Out[127]:

	App	Category	Rating	Reviews	Size	Installs	Type	Content Rating	Genres	Last Updated	Current Ver	Android Ver	Distribution
8884	"i DT" Fútbol. Todos Somos Técnicos.	Sports	4.197727	27.0	3774873.6	500	Free	Everyone	Sports	7-Oct-17	0.22	4.1 and up	Free
324	#NAME?	Comics	3.500000	115.0	9542041.6	10000	Free	Mature 17+	Comics	13-Jul-18	5.0.12	5.0 and up	Free
8532	+Download 4 Instagram Twitter	Social	4.500000	40467.0	23068672.0	1000000	Free	Everyone	Social	2-Aug-18	5.03	4.1 and up	Free
4541	.R	Tools	4.500000	259.0	207872.0	10000	Free	Everyone	Tools	16-Sep-14	1.1.06	1.5 and up	Free
4636	/u/app	Communication	4.700000	573.0	55574528.0	10000	Free	Mature 17+	Communication	3-Jul-18	4.2.4	4.1 and up	Free

In [128]:

df.loc[df['Category'] == 'Finance'].sort_values('Installs' , ascending=False )['App'].head(1)

Out[128]:

5601    Google Pay
Name: App, dtype: object

16. What Teen Game has the most reviews?¶

In [130]:

df.loc[(df['Category'] == 'Game') & (df['Content Rating'] == 'Teen')].sort_values('Reviews' , ascending=False )['App'].head(1)

Out[130]:

3912    Asphalt 8: Airborne
Name: App, dtype: object

17. Which is the free game with the most reviews?¶

In [133]:

df.loc[(df['Category'] == 'Game') & (df['Distribution'] == 'Free')].sort_values('Reviews' , ascending=False )['App'].head(1)

Out[133]:

1879    Clash of Clans
Name: App, dtype: object

18. How many Tb (tebibytes) were transferred (overall) for the most popular Lifestyle app?¶

In [135]:

df.loc[df['Category'] == 'Lifestyle'].sort_values('Installs' , ascending=False ).head(1)

Out[135]:

	App	Category	Rating	Reviews	Size	Installs	Type	Price	Content Rating	Genres	Last Updated	Current Ver	Android Ver	Distribution
4587	Tinder	Lifestyle	4.0	2789775.0	71303168.0	100000000	Free	0.0	Mature 17+	Lifestyle	2-Aug-18	9.5.0	4.4 and up	Free

In [136]:

tinder_size = 71303168.0
tinder_installations = 100000000

In [138]:

(tinder_size * tinder_installations) / (2**40)

Out[138]:

6484.9853515625

In [ ]:

Statement of Completion#b878c06c

Data Cleaning with Pandas

Capstone Project: Cleaning Google Playstore data

Data Cleaning¶

1. Which of the following column(s) has/have null values?¶

2. Clean the `Rating` column and the other columns containing null values¶

3. Clean the column `Reviews` and make it numeric¶

4. How many duplicated apps are there?¶

5. Drop duplicated apps keeping the ones with the greatest number of reviews¶

6. Format the `Category` column¶

7. Clean and convert the `Installs` column to numeric type¶

8. Clean and convert the `Size` column to numeric (representing bytes)¶

9. Clean and convert the `Price` column to numeric¶

10. Paid or free?¶

Analysis¶

11. Which app has the most reviews?¶

12. What category has the highest number of apps uploaded to the store?¶

13. To which category belongs the most expensive app?¶

14. What's the name of the most expensive game?¶

15. Which is the most popular Finance App?¶

16. What Teen Game has the most reviews?¶

17. Which is the free game with the most reviews?¶

18. How many Tb (tebibytes) were transferred (overall) for the most popular Lifestyle app?¶

Statement of Completion#b878c06c

Data Cleaning with Pandas

Capstone Project: Cleaning Google Playstore data

Data Cleaning¶

1. Which of the following column(s) has/have null values?¶

2. Clean the Rating column and the other columns containing null values¶

3. Clean the column Reviews and make it numeric¶

4. How many duplicated apps are there?¶

5. Drop duplicated apps keeping the ones with the greatest number of reviews¶

6. Format the Category column¶

7. Clean and convert the Installs column to numeric type¶

8. Clean and convert the Size column to numeric (representing bytes)¶

9. Clean and convert the Price column to numeric¶

10. Paid or free?¶

Analysis¶

11. Which app has the most reviews?¶

12. What category has the highest number of apps uploaded to the store?¶

13. To which category belongs the most expensive app?¶

14. What's the name of the most expensive game?¶

15. Which is the most popular Finance App?¶

16. What Teen Game has the most reviews?¶

17. Which is the free game with the most reviews?¶

18. How many Tb (tebibytes) were transferred (overall) for the most popular Lifestyle app?¶

2. Clean the `Rating` column and the other columns containing null values¶

3. Clean the column `Reviews` and make it numeric¶

6. Format the `Category` column¶

7. Clean and convert the `Installs` column to numeric type¶

8. Clean and convert the `Size` column to numeric (representing bytes)¶

9. Clean and convert the `Price` column to numeric¶