In [1]:

import pandas as pd

In [2]:

# Lists of data
data = {'Revenue': [274515,200734,182527,181945,143015,129184,92224,85965,84893,
                    82345,77867,73620,69864,63191],
        'Employees': [147000,267937,135301,878429,163000,197000,158000,58604,
                      109700,350864,110600,364800,85858,243540],
        'Sector': ['Consumer Electronics','Consumer Electronics','Software Services',
                   'Chip Manufacturing','Software Services','Consumer Electronics',
                   'Consumer Electronics','Software Services','Consumer Electronics',
                   'Consumer Electronics','Chip Manufacturing','Software Services',
                   'Software Services','Consumer Electronics'],
        'Founding Date':['01-04-1976','13-01-1969','04-09-1998','20-02-1974',
                         '04-04-1975','15-09-1987','01-02-1984','04-02-2004',
                         '07-04-1946','01-01-1910','18-07-1968','16-06-1911',
                         '11-11-1998','07-03-1918'],
        'Country':['USA','South Korea','USA','Taiwan','USA','China','USA','USA',
                   'Japan','Japan','USA','USA','China','Japan']} 
index = ['Apple','Samsung','Alphabet','Foxconn','Microsoft','Huawei',
         'Dell Technologies','Meta','Sony','Hitachi','Intel','IBM',
         'Tencent','Panasonic']

In [3]:

df = pd.DataFrame(data, index=index)

In [4]:

df

Out[4]:

	Revenue	Employees	Sector	Founding Date	Country
Apple	274515	147000	Consumer Electronics	01-04-1976	USA
Samsung	200734	267937	Consumer Electronics	13-01-1969	South Korea
Alphabet	182527	135301	Software Services	04-09-1998	USA
Foxconn	181945	878429	Chip Manufacturing	20-02-1974	Taiwan
Microsoft	143015	163000	Software Services	04-04-1975	USA
Huawei	129184	197000	Consumer Electronics	15-09-1987	China
Dell Technologies	92224	158000	Consumer Electronics	01-02-1984	USA
Meta	85965	58604	Software Services	04-02-2004	USA
Sony	84893	109700	Consumer Electronics	07-04-1946	Japan
Hitachi	82345	350864	Consumer Electronics	01-01-1910	Japan
Intel	77867	110600	Chip Manufacturing	18-07-1968	USA
IBM	73620	364800	Software Services	16-06-1911	USA
Tencent	69864	85858	Software Services	11-11-1998	China
Panasonic	63191	243540	Consumer Electronics	07-03-1918	Japan

Creating new columns¶

Expressions (and vectorized operations)¶

In [5]:

df["Revenue per Employee"] = df["Revenue"] / df["Employees"]

In [6]:

df.head()

Out[6]:

	Revenue	Employees	Sector	Founding Date	Country	Revenue per Employee
Apple	274515	147000	Consumer Electronics	01-04-1976	USA	1.867449
Samsung	200734	267937	Consumer Electronics	13-01-1969	South Korea	0.749184
Alphabet	182527	135301	Software Services	04-09-1998	USA	1.349044
Foxconn	181945	878429	Chip Manufacturing	20-02-1974	Taiwan	0.207125
Microsoft	143015	163000	Software Services	04-04-1975	USA	0.877393

Activities¶

1. Create a new column: `Revenue in $`¶

In [9]:

# Try your code here
df['Revenue in $'] = df['Revenue'] * 1000000
df.head()

Out[9]:

	Revenue	Employees	Sector	Founding Date	Country	Revenue per Employee	Revenue in $
Apple	274515	147000	Consumer Electronics	01-04-1976	USA	1.867449	274515000000
Samsung	200734	267937	Consumer Electronics	13-01-1969	South Korea	0.749184	200734000000
Alphabet	182527	135301	Software Services	04-09-1998	USA	1.349044	182527000000
Foxconn	181945	878429	Chip Manufacturing	20-02-1974	Taiwan	0.207125	181945000000
Microsoft	143015	163000	Software Services	04-04-1975	USA	0.877393	143015000000

2. Create a new column: `Is American?`¶

In [11]:

# Try your code here
df['Is American?'] = df['Country'] == 'USA'
df.head()

Out[11]:

	Revenue	Employees	Sector	Founding Date	Country	Revenue per Employee	Revenue in $	Is American?
Apple	274515	147000	Consumer Electronics	01-04-1976	USA	1.867449	274515000000	True
Samsung	200734	267937	Consumer Electronics	13-01-1969	South Korea	0.749184	200734000000	False
Alphabet	182527	135301	Software Services	04-09-1998	USA	1.349044	182527000000	True
Foxconn	181945	878429	Chip Manufacturing	20-02-1974	Taiwan	0.207125	181945000000	False
Microsoft	143015	163000	Software Services	04-04-1975	USA	0.877393	143015000000	True

Creating Columns out of Fixed Values¶

A single value¶

In [13]:

df['Is Tech?'] = "Yes"

In [14]:

df.head()

Out[14]:

	Revenue	Employees	Sector	Founding Date	Country	Revenue per Employee	Revenue in $	Is American?	Is Tech?
Apple	274515	147000	Consumer Electronics	01-04-1976	USA	1.867449	274515000000	True	Yes
Samsung	200734	267937	Consumer Electronics	13-01-1969	South Korea	0.749184	200734000000	False	Yes
Alphabet	182527	135301	Software Services	04-09-1998	USA	1.349044	182527000000	True	Yes
Foxconn	181945	878429	Chip Manufacturing	20-02-1974	Taiwan	0.207125	181945000000	False	Yes
Microsoft	143015	163000	Software Services	04-04-1975	USA	0.877393	143015000000	True	Yes

Collection of values¶

In [15]:

stock_prices = [143.28, 49.87, 88.26, 1.83, 253.75, 0,
                43.4, 167.32, 89.1, 52.6, 25.58, 137.35, 48.23, 8.81]

In [16]:

df['Stock Price'] = stock_prices

In [17]:

df.head()

Out[17]:

	Revenue	Employees	Sector	Founding Date	Country	Revenue per Employee	Revenue in $	Is American?	Is Tech?	Stock Price
Apple	274515	147000	Consumer Electronics	01-04-1976	USA	1.867449	274515000000	True	Yes	143.28
Samsung	200734	267937	Consumer Electronics	13-01-1969	South Korea	0.749184	200734000000	False	Yes	49.87
Alphabet	182527	135301	Software Services	04-09-1998	USA	1.349044	182527000000	True	Yes	88.26
Foxconn	181945	878429	Chip Manufacturing	20-02-1974	Taiwan	0.207125	181945000000	False	Yes	1.83
Microsoft	143015	163000	Software Services	04-04-1975	USA	0.877393	143015000000	True	Yes	253.75

Activities¶

3. Create a new column: `CEO`¶

In [18]:

ceo_list = [
    "Tim Cook", "Kim Ki Nam", "Sundar Pichai",
    "Young Liu", "Satya Nadella", "Ren Zhengfei",
    "Michael Dell", "Mark Zuckerberg",
    "Kenichiro Yoshida", "Toshiaki Higashihara", "Patrick Gelsinger",
    "Arvind Krishna", "Ma Huateng", "Yuki Kusumi"]

In [19]:

# Try your code here
df['CEO'] = ceo_list
df.head()

Out[19]:

	Revenue	Employees	Sector	Founding Date	Country	Revenue per Employee	Revenue in $	Is American?	Is Tech?	Stock Price	CEO
Apple	274515	147000	Consumer Electronics	01-04-1976	USA	1.867449	274515000000	True	Yes	143.28	Tim Cook
Samsung	200734	267937	Consumer Electronics	13-01-1969	South Korea	0.749184	200734000000	False	Yes	49.87	Kim Ki Nam
Alphabet	182527	135301	Software Services	04-09-1998	USA	1.349044	182527000000	True	Yes	88.26	Sundar Pichai
Foxconn	181945	878429	Chip Manufacturing	20-02-1974	Taiwan	0.207125	181945000000	False	Yes	1.83	Young Liu
Microsoft	143015	163000	Software Services	04-04-1975	USA	0.877393	143015000000	True	Yes	253.75	Satya Nadella

Deleting Columns with `del`¶

In [21]:

del df["Is Tech?"]

In [22]:

df.head()

Out[22]:

	Revenue	Employees	Sector	Founding Date	Country	Revenue per Employee	Revenue in $	Is American?	Stock Price	CEO
Apple	274515	147000	Consumer Electronics	01-04-1976	USA	1.867449	274515000000	True	143.28	Tim Cook
Samsung	200734	267937	Consumer Electronics	13-01-1969	South Korea	0.749184	200734000000	False	49.87	Kim Ki Nam
Alphabet	182527	135301	Software Services	04-09-1998	USA	1.349044	182527000000	True	88.26	Sundar Pichai
Foxconn	181945	878429	Chip Manufacturing	20-02-1974	Taiwan	0.207125	181945000000	False	1.83	Young Liu
Microsoft	143015	163000	Software Services	04-04-1975	USA	0.877393	143015000000	True	253.75	Satya Nadella

Activiies¶

4. Delete the column `CEO`¶

In [23]:

# Try your code here
del df['CEO']
df.head()

Out[23]:

	Revenue	Employees	Sector	Founding Date	Country	Revenue per Employee	Revenue in $	Is American?	Stock Price
Apple	274515	147000	Consumer Electronics	01-04-1976	USA	1.867449	274515000000	True	143.28
Samsung	200734	267937	Consumer Electronics	13-01-1969	South Korea	0.749184	200734000000	False	49.87
Alphabet	182527	135301	Software Services	04-09-1998	USA	1.349044	182527000000	True	88.26
Foxconn	181945	878429	Chip Manufacturing	20-02-1974	Taiwan	0.207125	181945000000	False	1.83
Microsoft	143015	163000	Software Services	04-04-1975	USA	0.877393	143015000000	True	253.75

Mutability vs Immutability¶

In [25]:

s = "Hello World"

In [26]:

s.replace("World", "Datawars")

Out[26]:

'Hello Datawars'

In [27]:

# not modified
s

Out[27]:

'Hello World'

In [28]:

s.lower()

Out[28]:

'hello world'

In [29]:

# not modified
s

Out[29]:

'Hello World'

Deleting rows¶

In [30]:

df.drop(["Microsoft", "Tencent", "Samsung", "Alphabet", "Meta", "Hitachi", "Apple"])

Out[30]:

	Revenue	Employees	Sector	Founding Date	Country	Revenue per Employee	Revenue in $	Is American?	Stock Price
Foxconn	181945	878429	Chip Manufacturing	20-02-1974	Taiwan	0.207125	181945000000	False	1.83
Huawei	129184	197000	Consumer Electronics	15-09-1987	China	0.655756	129184000000	False	0.00
Dell Technologies	92224	158000	Consumer Electronics	01-02-1984	USA	0.583696	92224000000	True	43.40
Sony	84893	109700	Consumer Electronics	07-04-1946	Japan	0.773865	84893000000	False	89.10
Intel	77867	110600	Chip Manufacturing	18-07-1968	USA	0.704042	77867000000	True	25.58
IBM	73620	364800	Software Services	16-06-1911	USA	0.201809	73620000000	True	137.35
Panasonic	63191	243540	Consumer Electronics	07-03-1918	Japan	0.259469	63191000000	False	8.81

In [31]:

# the underlying `df` has not changed
df.head()

Out[31]:

	Revenue	Employees	Sector	Founding Date	Country	Revenue per Employee	Revenue in $	Is American?	Stock Price
Apple	274515	147000	Consumer Electronics	01-04-1976	USA	1.867449	274515000000	True	143.28
Samsung	200734	267937	Consumer Electronics	13-01-1969	South Korea	0.749184	200734000000	False	49.87
Alphabet	182527	135301	Software Services	04-09-1998	USA	1.349044	182527000000	True	88.26
Foxconn	181945	878429	Chip Manufacturing	20-02-1974	Taiwan	0.207125	181945000000	False	1.83
Microsoft	143015	163000	Software Services	04-04-1975	USA	0.877393	143015000000	True	253.75

5. Drop Microsoft from the `df`¶

In [34]:

# Try your code here
df_no_windows = df.drop(['Microsoft'])

Mutable modification with `inplace`¶

In [36]:

# This method produces no result
df.drop("Huawei", inplace=True)

In [37]:

df

Out[37]:

	Revenue	Employees	Sector	Founding Date	Country	Revenue per Employee	Revenue in $	Is American?	Stock Price
Apple	274515	147000	Consumer Electronics	01-04-1976	USA	1.867449	274515000000	True	143.28
Samsung	200734	267937	Consumer Electronics	13-01-1969	South Korea	0.749184	200734000000	False	49.87
Alphabet	182527	135301	Software Services	04-09-1998	USA	1.349044	182527000000	True	88.26
Foxconn	181945	878429	Chip Manufacturing	20-02-1974	Taiwan	0.207125	181945000000	False	1.83
Microsoft	143015	163000	Software Services	04-04-1975	USA	0.877393	143015000000	True	253.75
Dell Technologies	92224	158000	Consumer Electronics	01-02-1984	USA	0.583696	92224000000	True	43.40
Meta	85965	58604	Software Services	04-02-2004	USA	1.466879	85965000000	True	167.32
Sony	84893	109700	Consumer Electronics	07-04-1946	Japan	0.773865	84893000000	False	89.10
Hitachi	82345	350864	Consumer Electronics	01-01-1910	Japan	0.234692	82345000000	False	52.60
Intel	77867	110600	Chip Manufacturing	18-07-1968	USA	0.704042	77867000000	True	25.58
IBM	73620	364800	Software Services	16-06-1911	USA	0.201809	73620000000	True	137.35
Tencent	69864	85858	Software Services	11-11-1998	China	0.813716	69864000000	False	48.23
Panasonic	63191	243540	Consumer Electronics	07-03-1918	Japan	0.259469	63191000000	False	8.81

6. Delete inplace the values for IBM and Dell¶

In [39]:

# Try your code here
df.drop(['IBM', 'Dell Technologies'], inplace = True)

Deleting rows based on a condition¶

In [41]:

df.sort_values(by='Revenue').head()

Out[41]:

	Revenue	Employees	Sector	Founding Date	Country	Revenue per Employee	Revenue in $	Is American?	Stock Price
Panasonic	63191	243540	Consumer Electronics	07-03-1918	Japan	0.259469	63191000000	False	8.81
Tencent	69864	85858	Software Services	11-11-1998	China	0.813716	69864000000	False	48.23
Intel	77867	110600	Chip Manufacturing	18-07-1968	USA	0.704042	77867000000	True	25.58
Hitachi	82345	350864	Consumer Electronics	01-01-1910	Japan	0.234692	82345000000	False	52.60
Sony	84893	109700	Consumer Electronics	07-04-1946	Japan	0.773865	84893000000	False	89.10

We need to replicate the following expression using conditions:

In [42]:

df.drop(["Intel", "Tencent", "Panasonic"])

Out[42]:

	Revenue	Employees	Sector	Founding Date	Country	Revenue per Employee	Revenue in $	Is American?	Stock Price
Apple	274515	147000	Consumer Electronics	01-04-1976	USA	1.867449	274515000000	True	143.28
Samsung	200734	267937	Consumer Electronics	13-01-1969	South Korea	0.749184	200734000000	False	49.87
Alphabet	182527	135301	Software Services	04-09-1998	USA	1.349044	182527000000	True	88.26
Foxconn	181945	878429	Chip Manufacturing	20-02-1974	Taiwan	0.207125	181945000000	False	1.83
Microsoft	143015	163000	Software Services	04-04-1975	USA	0.877393	143015000000	True	253.75
Meta	85965	58604	Software Services	04-02-2004	USA	1.466879	85965000000	True	167.32
Sony	84893	109700	Consumer Electronics	07-04-1946	Japan	0.773865	84893000000	False	89.10
Hitachi	82345	350864	Consumer Electronics	01-01-1910	Japan	0.234692	82345000000	False	52.60

The condition:

Companies with less than M$80,000

In [43]:

df.loc[df["Revenue"] < 80_000]

Out[43]:

	Revenue	Employees	Sector	Founding Date	Country	Revenue per Employee	Revenue in $	Is American?	Stock Price
Intel	77867	110600	Chip Manufacturing	18-07-1968	USA	0.704042	77867000000	True	25.58
Tencent	69864	85858	Software Services	11-11-1998	China	0.813716	69864000000	False	48.23
Panasonic	63191	243540	Consumer Electronics	07-03-1918	Japan	0.259469	63191000000	False	8.81

The companies that match that query:

In [44]:

df.loc[df["Revenue"] < 80_000].index

Out[44]:

Index(['Intel', 'Tencent', 'Panasonic'], dtype='object')

The resulting .drop() expression:

In [45]:

# .drop() and .sort_values() chaining in action
df.drop(df.loc[df["Revenue"] < 80_000].index).sort_values(by='Revenue')

Out[45]:

	Revenue	Employees	Sector	Founding Date	Country	Revenue per Employee	Revenue in $	Is American?	Stock Price
Hitachi	82345	350864	Consumer Electronics	01-01-1910	Japan	0.234692	82345000000	False	52.60
Sony	84893	109700	Consumer Electronics	07-04-1946	Japan	0.773865	84893000000	False	89.10
Meta	85965	58604	Software Services	04-02-2004	USA	1.466879	85965000000	True	167.32
Microsoft	143015	163000	Software Services	04-04-1975	USA	0.877393	143015000000	True	253.75
Foxconn	181945	878429	Chip Manufacturing	20-02-1974	Taiwan	0.207125	181945000000	False	1.83
Alphabet	182527	135301	Software Services	04-09-1998	USA	1.349044	182527000000	True	88.26
Samsung	200734	267937	Consumer Electronics	13-01-1969	South Korea	0.749184	200734000000	False	49.87
Apple	274515	147000	Consumer Electronics	01-04-1976	USA	1.867449	274515000000	True	143.28

7. Delete companies with revenue lower than the mean¶

In [46]:

# Try your code here
df_high_revenue = df.drop(df.loc[df['Revenue'] < df['Revenue'].mean()].index)

8. Drop the companies that are NOT from the USA¶

In [49]:

# Try your code here
df_usa_only = df.drop(df.loc[df['Country'] != 'USA'].index)

9. Japanese companies sorted by Revenue (desc)¶

In [51]:

df['Country'].unique()

Out[51]:

array(['USA', 'South Korea', 'Taiwan', 'Japan', 'China'], dtype=object)

In [52]:

# Try your code here
df_jp_desc = df.drop(df.loc[df['Country'] != 'Japan'].index).sort_values(by='Revenue', ascending=False)

Removing columns with Drop¶

In [ ]:

df.drop(['Revenue', 'Employees'], axis=1)

Statement of Completion#3a337a88

Intro to Pandas for Data Analysis

Modifying DataFrames: creating columns and more

Creating new columns¶

Expressions (and vectorized operations)¶

Activities¶

1. Create a new column: `Revenue in $`¶

2. Create a new column: `Is American?`¶

Creating Columns out of Fixed Values¶

A single value¶

Collection of values¶

Activities¶

3. Create a new column: `CEO`¶

Deleting Columns with `del`¶

Activiies¶

4. Delete the column `CEO`¶

Mutability vs Immutability¶

Deleting rows¶

5. Drop Microsoft from the `df`¶

Mutable modification with `inplace`¶

6. Delete inplace the values for IBM and Dell¶

Deleting rows based on a condition¶

7. Delete companies with revenue lower than the mean¶

8. Drop the companies that are NOT from the USA¶

9. Japanese companies sorted by Revenue (desc)¶

Removing columns with Drop¶

The End!¶

Statement of Completion#3a337a88

Intro to Pandas for Data Analysis

Modifying DataFrames: creating columns and more

Creating new columns¶

Expressions (and vectorized operations)¶

Activities¶

1. Create a new column: Revenue in $¶

2. Create a new column: Is American?¶

Creating Columns out of Fixed Values¶

A single value¶

Collection of values¶

Activities¶

3. Create a new column: CEO¶

Deleting Columns with del¶

Activiies¶

4. Delete the column CEO¶

Mutability vs Immutability¶

Deleting rows¶

5. Drop Microsoft from the df¶

Mutable modification with inplace¶

6. Delete inplace the values for IBM and Dell¶

Deleting rows based on a condition¶

7. Delete companies with revenue lower than the mean¶

8. Drop the companies that are NOT from the USA¶

9. Japanese companies sorted by Revenue (desc)¶

Removing columns with Drop¶

The End!¶

1. Create a new column: `Revenue in $`¶

2. Create a new column: `Is American?`¶

3. Create a new column: `CEO`¶

Deleting Columns with `del`¶

4. Delete the column `CEO`¶

5. Drop Microsoft from the `df`¶

Mutable modification with `inplace`¶