Statement of Completion#6123ac7b
Intro to Pandas for Data Analysis
easy
Creating plots with pandas
Resolution
Activities
Project.ipynb
Creating plots with pandas¶
The plot method¶
In [1]:
import pandas as pd
import numpy as np
In [2]:
# Daily time series of 1000 standard-normal samples starting on 2000-01-01.
# The samples come from a seeded Generator so that Restart & Run All yields
# the same series every time (unseeded `np.random.randn` changes per run).
ts = pd.Series(
    np.random.default_rng(seed=0).standard_normal(1000),
    index=pd.date_range('1/1/2000', periods=1000),
)
In [3]:
# Replace the series with its running total (cumulative sum of the samples).
# This rebinds `ts`, so re-running this cell accumulates a second time.
ts = ts.cumsum()
In [4]:
# Preview the first rows of the cumulative series.
ts.head()
Out[4]:
2000-01-01 -0.592325 2000-01-02 -1.417050 2000-01-03 -2.662854 2000-01-04 -3.204283 2000-01-05 -5.686098 Freq: D, dtype: float64
Line chart¶
In [5]:
# Line chart of the series against its date index ("line" is the default kind).
ts.plot.line()
Out[5]:
<Axes: >
In [6]:
# 1000 x 4 frame of standard-normal samples (columns A-D) that reuses the date
# index of `ts`. A seeded Generator keeps the frame identical across
# Restart & Run All (unseeded `np.random.randn` changes on every run).
df = pd.DataFrame(
    np.random.default_rng(seed=1).standard_normal((1000, 4)),
    index=ts.index,
    columns=list('ABCD'),
)
In [7]:
# Preview the first rows of the raw (not yet accumulated) samples.
df.head()
Out[7]:
A | B | C | D | |
---|---|---|---|---|
2000-01-01 | -0.416102 | -0.469991 | 0.834004 | -0.999729 |
2000-01-02 | 1.182148 | 1.074082 | -0.809501 | 0.813045 |
2000-01-03 | 2.320584 | 0.524429 | 1.728922 | -0.936292 |
2000-01-04 | -0.021193 | -0.985823 | -2.142262 | -0.125014 |
2000-01-05 | -1.264067 | -1.737040 | -1.028364 | -0.809272 |
In [8]:
# Running total down each column. This rebinds `df`, so re-running the cell
# accumulates again on top of the already-accumulated values.
df = df.cumsum()
# Display the frame; the rendered output is truncated to the first/last rows.
df
Out[8]:
A | B | C | D | |
---|---|---|---|---|
2000-01-01 | -0.416102 | -0.469991 | 0.834004 | -0.999729 |
2000-01-02 | 0.766046 | 0.604091 | 0.024503 | -0.186684 |
2000-01-03 | 3.086629 | 1.128520 | 1.753426 | -1.122976 |
2000-01-04 | 3.065436 | 0.142698 | -0.388836 | -1.247990 |
2000-01-05 | 1.801369 | -1.594342 | -1.417201 | -2.057262 |
... | ... | ... | ... | ... |
2002-09-22 | -29.887551 | -5.512215 | -27.085051 | -66.790310 |
2002-09-23 | -29.834693 | -6.891953 | -25.034900 | -67.620236 |
2002-09-24 | -28.368505 | -6.620989 | -25.083735 | -68.047712 |
2002-09-25 | -28.412433 | -7.138621 | -26.146851 | -66.215998 |
2002-09-26 | -29.115388 | -7.531759 | -26.402414 | -66.779953 |
1000 rows × 4 columns
In [9]:
# One line per column (A-D) on shared axes, indexed by date.
df.plot.line()
Out[9]:
<Axes: >
Business sales data¶
In [10]:
# Load the business sales dataset; the path is relative to the working directory.
sales_df = pd.read_csv('business-sales.csv')
In [11]:
# Preview the first rows to see the available columns.
sales_df.head()
Out[11]:
division | level of education | training level | work experience | salary | sales | |
---|---|---|---|---|---|---|
0 | computer hardware | associate's degree | 0 | 4 | 75650 | 209271 |
1 | office supplies | associate's degree | 2 | 7 | 96502 | 394503 |
2 | printers | some college | 0 | 2 | 55359 | 113345 |
3 | printers | associate's degree | 1 | 6 | 83955 | 301349 |
4 | computer software | some college | 1 | 6 | 103202 | 372674 |
In [12]:
# Print column names, non-null counts and dtypes.
sales_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 division 1000 non-null object 1 level of education 1000 non-null object 2 training level 1000 non-null int64 3 work experience 1000 non-null int64 4 salary 1000 non-null int64 5 sales 1000 non-null int64 dtypes: int64(4), object(2) memory usage: 47.0+ KB
In [13]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
sales_df.describe()
Out[13]:
training level | work experience | salary | sales | |
---|---|---|---|---|
count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
mean | 1.212000 | 6.071000 | 89344.121000 | 333310.988000 |
std | 0.938049 | 3.027396 | 17693.218567 | 110626.544485 |
min | 0.000000 | 0.000000 | 41672.000000 | 37354.000000 |
25% | 0.000000 | 4.000000 | 77819.500000 | 260057.750000 |
50% | 1.000000 | 6.000000 | 89933.500000 | 335900.500000 |
75% | 2.000000 | 8.000000 | 101074.500000 | 405382.500000 |
max | 3.000000 | 16.000000 | 152529.000000 | 785291.000000 |
Plot types¶
Bar chart¶
In [14]:
# Number of rows per division, most frequent division first.
division_column = sales_df['division']
division_values = division_column.value_counts()
division_values
Out[14]:
division printers 270 office supplies 246 peripherals 228 computer hardware 174 computer software 82 Name: count, dtype: int64
In [15]:
# Vertical bar chart: one bar per division, height = row count.
division_values.plot.bar()
Out[15]:
<Axes: xlabel='division'>
In [16]:
# Same bar chart, with the tick labels rotated by 30 degrees.
division_values.plot.bar(rot=30)
Out[16]:
<Axes: xlabel='division'>
In [17]:
# Rotated bar chart with a single color applied to every bar.
division_values.plot.bar(rot=30, color="orange")
Out[17]:
<Axes: xlabel='division'>
In [18]:
# Rotated bar chart with one color per bar, taken in order from the list.
division_palette = ["red", "blue", "green", "yellow", "orange"]
division_values.plot.bar(rot=30, color=division_palette)
Out[18]:
<Axes: xlabel='division'>
In [19]:
# Horizontal variant of the bar chart, again with one color per bar.
division_palette = ["red", "blue", "green", "yellow", "orange"]
division_values.plot.barh(rot=30, color=division_palette)
Out[19]:
<Axes: ylabel='division'>
In [21]:
# Mean of `sales` within each division, returned as a flat two-column frame
# (division, sales) with a fresh integer index.
sales_per_division = (
    sales_df
    .groupby("division")[["sales"]]
    .mean()
    .reset_index()
)
sales_per_division
Out[21]:
division | sales | |
---|---|---|
0 | computer hardware | 327411.333333 |
1 | computer software | 339389.036585 |
2 | office supplies | 336236.727642 |
3 | peripherals | 326656.192982 |
4 | printers | 338220.992593 |
In [23]:
# Add a `next_year_sales` column: the per-division `sales` value scaled by 1.3.
scaled_sales = sales_per_division["sales"].mul(1.3)
sales_per_division["next_year_sales"] = scaled_sales
In [24]:
# Show the frame with the newly added `next_year_sales` column.
sales_per_division
Out[24]:
division | sales | next_year_sales | |
---|---|---|---|
0 | computer hardware | 327411.333333 | 425634.733333 |
1 | computer software | 339389.036585 | 441205.747561 |
2 | office supplies | 336236.727642 | 437107.745935 |
3 | peripherals | 326656.192982 | 424653.050877 |
4 | printers | 338220.992593 | 439687.290370 |
In [25]:
# Bar chart of `sales` per division; `x` selects the category column.
sales_per_division.plot.bar(x="division", y="sales", rot=30)
Out[25]:
<Axes: xlabel='division'>
In [26]:
# Passing a list for `y` draws one bar per column, side by side per division.
compared_columns = ["sales", "next_year_sales"]
sales_per_division.plot.bar(x="division", y=compared_columns, rot=30)
Out[26]:
<Axes: xlabel='division'>
In [27]:
# `stacked=True` piles the two columns on top of each other in a single bar.
compared_columns = ["sales", "next_year_sales"]
sales_per_division.plot.bar(
    x="division",
    y=compared_columns,
    stacked=True,
    rot=30,
)
Out[27]:
<Axes: xlabel='division'>
In [28]:
# Stacked bar chart with explicit axis labels.
# NOTE(review): `sales` in this frame was built with groupby(...).mean(), so
# the bars show per-division averages; confirm "Total sales" is the intended label.
compared_columns = ["sales", "next_year_sales"]
sales_per_division.plot.bar(
    x="division",
    y=compared_columns,
    stacked=True,
    rot=30,
    xlabel="Division",
    ylabel="Total sales",
)
Out[28]:
<Axes: xlabel='Division', ylabel='Total sales'>
Scatter plot¶
In [29]:
# Scatter plot: one point per row, salary on x and sales on y.
sales_df.plot.scatter(x="salary", y="sales")
Out[29]:
<Axes: xlabel='salary', ylabel='sales'>
Histogram¶
In [30]:
# Histogram of work experience using the default number of bins.
sales_df['work experience'].plot.hist()
Out[30]:
<Axes: ylabel='Frequency'>
In [31]:
# Same histogram with a finer resolution of 20 bins.
sales_df['work experience'].plot.hist(bins=20)
Out[31]:
<Axes: ylabel='Frequency'>
KDE¶
In [32]:
# Kernel density estimate: a smoothed alternative to the histogram.
sales_df['work experience'].plot.kde()
Out[32]:
<Axes: ylabel='Density'>
Pie chart¶
In [33]:
# Number of rows per education level, most frequent level first.
education_column = sales_df['level of education']
education_values = education_column.value_counts()
education_values
Out[33]:
level of education some college 401 associate's degree 393 high school 102 bachelor's degree 98 master's degree 6 Name: count, dtype: int64
In [34]:
# Pie chart: one wedge per education level, sized by its count.
education_values.plot.pie()
Out[34]:
<Axes: ylabel='count'>
In [35]:
# Same pie chart on a larger 8x8 inch figure.
education_values.plot.pie(figsize=(8, 8))
Out[35]:
<Axes: ylabel='count'>
In [36]:
# `autopct` prints each wedge's share as a percentage with two decimals.
education_values.plot.pie(figsize=(8, 8), autopct='%1.2f%%')
Out[36]:
<Axes: ylabel='count'>
Area chart¶
In [37]:
# Number of rows per distinct work-experience value, most frequent first.
experience_column = sales_df['work experience']
experience_values = experience_column.value_counts()
experience_values.head()
Out[37]:
work experience 7 133 5 130 6 118 8 115 4 84 Name: count, dtype: int64
In [38]:
# Reorder the counts by their index (the work-experience value) instead of by
# frequency, so the x-axis of the following charts runs in ascending order.
experience_values = experience_values.sort_index()
experience_values.head()
Out[38]:
work experience 0 34 1 46 2 63 3 66 4 84 Name: count, dtype: int64
In [39]:
# Area chart of the counts across work-experience values.
experience_values.plot.area()
Out[39]:
<Axes: xlabel='work experience'>
In [40]:
# Two scaled-down copies of the counts, used to build a multi-column area chart.
experience_values_2 = experience_values.div(2)
experience_values_3 = experience_values.div(7)
In [41]:
# Combine the original and the two scaled series into one frame; the series
# share the same index, so they line up row by row.
experience_columns = {
    "exp_1": experience_values,
    "exp_2": experience_values_2,
    "exp_3": experience_values_3,
}
new_df = pd.DataFrame(experience_columns)
new_df.head()
Out[41]:
exp_1 | exp_2 | exp_3 | |
---|---|---|---|
work experience | |||
0 | 34 | 17.0 | 4.857143 |
1 | 46 | 23.0 | 6.571429 |
2 | 63 | 31.5 | 9.000000 |
3 | 66 | 33.0 | 9.428571 |
4 | 84 | 42.0 | 12.000000 |
In [42]:
# Area chart with one layer per column; layers are stacked by default.
new_df.plot.area()
Out[42]:
<Axes: xlabel='work experience'>
Boxplot¶
In [43]:
# Box plot of the numeric columns on a single shared axis. The columns have
# very different value ranges, so the small-range ones are hard to read here.
sales_df.plot.box()
Out[43]:
<Axes: >
In [44]:
# Box plot of a single column so that it gets its own scale.
sales_df["salary"].plot.box()
Out[44]:
<Axes: >
In [45]:
# Box plot of work experience on its own scale.
sales_df["work experience"].plot.box()
Out[45]:
<Axes: >
Extra customizations¶
In [46]:
# Baseline histogram that the following cells customize step by step.
sales_df['work experience'].plot.hist()
Out[46]:
<Axes: ylabel='Frequency'>
In [47]:
# Step 1: change the bar color.
sales_df['work experience'].plot.hist(color="salmon")
Out[47]:
<Axes: ylabel='Frequency'>
In [48]:
# Step 2: add a title above the axes.
sales_df['work experience'].plot.hist(
    color="salmon",
    title="Histogram of Work experience",
)
Out[48]:
<Axes: title={'center': 'Histogram of Work experience'}, ylabel='Frequency'>
In [49]:
# Step 3: enlarge the figure to 10x6 inches.
sales_df['work experience'].plot.hist(
    color="salmon",
    figsize=(10, 6),
    title="Histogram of Work experience",
)
Out[49]:
<Axes: title={'center': 'Histogram of Work experience'}, ylabel='Frequency'>
In [50]:
import matplotlib.pyplot as plt
In [51]:
# Step 4: give the series a label and show it in a legend.
sales_df['work experience'].plot.hist(
    color="salmon",
    figsize=(10, 6),
    label="Work experience in years",
    title="Histogram of Work experience",
)
# The legend picks up the `label` passed to the plot call above.
plt.legend()
Out[51]:
<matplotlib.legend.Legend at 0x7f8c5a915d90>
In [52]:
# Step 5: turn on grid lines behind the bars.
sales_df['work experience'].plot.hist(
    color="salmon",
    figsize=(10, 6),
    grid=True,
    label="Work experience in years",
    title="Histogram of Work experience",
)
# The legend picks up the `label` passed to the plot call above.
plt.legend()
Out[52]:
<matplotlib.legend.Legend at 0x7f8c5a8a7310>
In [53]:
# Step 6: enlarge the tick labels with `fontsize`.
sales_df['work experience'].plot.hist(
    color="salmon",
    figsize=(10, 6),
    grid=True,
    fontsize=14,
    label="Work experience in years",
    title="Histogram of Work experience",
)
# The legend picks up the `label` passed to the plot call above.
plt.legend()
Out[53]:
<matplotlib.legend.Legend at 0x7f8c5a9161d0>
Exporting my chart¶
In [54]:
# Keep the Axes returned by the plot call so the chart can be exported later.
my_chart = sales_df['work experience'].plot.hist(
    color="salmon",
    figsize=(10, 6),
    grid=True,
    fontsize=14,
    label="Work experience in years",
    title="Histogram of Work experience",
)
# The legend picks up the `label` passed to the plot call above.
plt.legend()
Out[54]:
<matplotlib.legend.Legend at 0x7f8c5a50d6d0>
In [ ]:
# Export the figure that owns the chart's Axes as a PNG file.
chart_figure = my_chart.figure
chart_figure.savefig("my-chart.png")
In [ ]:
# Export the same figure as an SVG (vector) file.
chart_figure = my_chart.figure
chart_figure.savefig("my-chart.svg")