import datetime

import numpy as np
import pandas as pd
# --- Label-based indexing on the Titanic passenger table ---
df = pd.read_csv('./data/titanic.csv')
df.head()

# Building a DataFrame from a dict of equal-length columns.
data = {'country': ['aaa', 'bbb', 'ccc'], 'population': [10, 12, 14]}
df_data = pd.DataFrame(data)
df_data

# Use the passenger name as the row label so rows can be looked up by name.
df = df.set_index('Name')
df.head()

# `age` was referenced without ever being defined (NameError). Bind it to the
# 'Age' column, which is presumably what was intended -- TODO confirm.
age = df['Age']
age['Braund, Mr. Owen Harris']

# The frame is already freshly loaded and indexed by 'Name', so the CSV is
# not read and re-indexed a second time here.
df
df.loc['Heikkinen, Miss. Laina']
df.loc['Heikkinen, Miss. Laina', 'Fare']

# Boolean-mask selection: first five male passengers, then their mean age.
df[df['Sex'] == 'male'][:5]
df.loc[df['Sex'] == 'male', 'Age'].mean()
# --- Manual per-key aggregation versus groupby ---
df = pd.DataFrame({
    'key': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C'],
    'data': [0, 5, 10, 5, 10, 15, 10, 15, 20],
})
df

# Hand-rolled aggregation: filter the rows of each key and sum its values.
# Only the numeric 'data' column is summed; summing the whole frame would
# also concatenate the string 'key' column.
for key in ['A', 'B', 'C']:
    print(key, df.loc[df['key'] == key, 'data'].sum())

# The same split/apply/combine idea in one call. The string alias 'mean' is
# used because passing the numpy callable to aggregate is deprecated.
df.groupby('key').aggregate('mean')
# --- Group statistics on the Titanic data ---
df = pd.read_csv('./data/titanic.csv')
# Mean age and mean survival flag per sex.
df['Age'].groupby(df['Sex']).mean()
df['Survived'].groupby(df['Sex']).mean()

# Small hand-built frame with explicit row and column labels.
df = pd.DataFrame(
    data=[[1, 2, 3], [4, 5, 6]],
    index=list('ab'),
    columns=list('ABC'),
)
df

# --- Frequency tables ---
df = pd.read_csv('./data/titanic.csv')
df.head(5)
# Counts per distinct value, least frequent first.
df['Age'].value_counts(ascending=True)
df['Pclass'].value_counts(ascending=True)
# Same idea, but with the ages grouped into 5 equal-width bins first.
df['Age'].value_counts(bins=5, ascending=True)
# --- Series: create, select, modify, relabel, drop ---
data = [10, 11, 12]
index = ['a', 'b', 'c']
s = pd.Series(data, index=index)
s

# Boolean-list selection keeps the positions flagged True.
mask = [True, False, True]
s[mask]

# Work on a copy so `s` itself stays untouched.
s1 = s.copy()
s1['a'] = 100
s1

# Swap every 100 for 101.
s1 = s1.replace(100, 101)
s1

# Overwrite the whole index at once ...
s1.index = ['a', 'b', 'd']
s1

# ... or relabel individual entries through a mapping.
s1 = s1.rename(index={'a': 'A'})
s1

data = [100, 110]
index = ['h', 'k']
s2 = pd.Series(data, index=index)
s2

# Remove entries by label.
s1 = s1.drop(index=['b', 'd'])
s1
# --- DataFrame: modify a cell, concatenate, add and drop rows/columns ---
data = [[1, 2, 3], [4, 5, 6]]
index = ['a', 'b']
columns = ['A', 'B', 'C']
df = pd.DataFrame(data=data, index=index, columns=columns)
df

# Set one cell with a single .loc[row, column] call. The chained form
# df.loc['a']['A'] = ... assigns into an intermediate object and is not
# guaranteed to reach `df` (it is a no-op under copy-on-write).
df.loc['a', 'A'] = 150
df

data = [[1, 2, 3], [4, 5, 6]]
index = ['j', 'k']
columns = ['A', 'B', 'C']
df2 = pd.DataFrame(data=data, index=index, columns=columns)
df2

# Stack the two frames vertically (rows of df followed by rows of df2).
df3 = pd.concat([df, df2], axis=0)
df3

# Add a new column by assignment.
df2['Test'] = [10, 11]
df2

df4 = pd.DataFrame([[10, 11], [12, 13]], index=['j', 'k'], columns=['D', 'E'])
df4

# Join side by side, aligning on the shared row labels.
df5 = pd.concat([df2, df4], axis=1)
df5

# Drop a row by label, then several columns by label.
df5.drop(['j'], axis=0, inplace=True)
df5
df5.drop(['A', 'B', 'C'], axis=1, inplace=True)
df5
# --- Merging on a single key column ---
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
# With no `on`, the columns shared by both frames are used as the join key.
res = left.merge(right)
res
res = left.merge(right, on='key')
res

# --- Merging on two key columns; the last key2 value differs (K3 vs K4) ---
left = pd.DataFrame({
    'key1': ['K0', 'K1', 'K2', 'K3'],
    'key2': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K2', 'K3'],
    'key2': ['K0', 'K1', 'K2', 'K4'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
# Default (inner) join: only key pairs present on both sides.
res = left.merge(right, on=['key1', 'key2'])
res
# Outer join: keep unmatched rows from both sides.
res = left.merge(right, how='outer', on=['key1', 'key2'])
res
# indicator=True adds a `_merge` column telling where each row came from.
res = left.merge(right, how='outer', on=['key1', 'key2'], indicator=True)
res
# Left / right joins keep every row of the named side.
res = left.merge(right, how='left')
res
res = left.merge(right, how='right')
res
# --- Display options (each pair: read current value, then change it) ---

# Maximum number of rows printed.
pd.options.display.max_rows
pd.Series(index=range(100))
pd.options.display.max_rows = 6
pd.Series(index=range(100))

# Maximum number of columns printed.
pd.options.display.max_columns
pd.DataFrame(columns=range(30))
pd.options.display.max_columns = 20
pd.DataFrame(columns=range(30))

# Maximum width of a single cell's text.
pd.options.display.max_colwidth
pd.Series(['t' * 70], index=['A'])
pd.options.display.max_colwidth = 100
pd.Series(['t' * 70], index=['A'])

# Number of decimal places shown for floats.
pd.options.display.precision
pd.Series([1.2345678923456])
pd.options.display.precision = 5
pd.Series([1.2345678923456])
# --- Reshaping long data into a Category x Month table ---
# One row per (month, category) pair with the amount spent.
example = pd.DataFrame({
    'Month': [
        "January", "January", "January", "January",
        "February", "February", "February", "February",
        "March", "March", "March", "March",
    ],
    'Category': [
        "Transportation", "Grocery", "Household", "Entertainment",
        "Transportation", "Grocery", "Household", "Entertainment",
        "Transportation", "Grocery", "Household", "Entertainment",
    ],
    'Amount': [
        74., 235., 175., 100.,
        115., 240., 225., 125.,
        90., 260., 200., 120.,
    ],
})

# Categories become the rows, months the columns, amounts the cell values.
example_pivot = example.pivot(
    index='Category',
    columns='Month',
    values='Amount',
)
example_pivot

# Total per category (summing across the month columns) ...
example_pivot.sum(axis='columns')
# ... and total per month (summing down the category rows).
example_pivot.sum(axis='index')
# --- Pivot tables on the Titanic data ---
df = pd.read_csv('./data/titanic.csv')
df.head(5)

# Fare by sex (rows) and passenger class (columns); no aggfunc given here.
pd.pivot_table(df, values='Fare', index='Sex', columns='Pclass')
# The same table with an explicit aggregation: max, count, then mean.
pd.pivot_table(df, values='Fare', index='Sex', columns='Pclass', aggfunc='max')
pd.pivot_table(df, values='Fare', index='Sex', columns='Pclass', aggfunc='count')
pd.pivot_table(df, values='Fare', index='Sex', columns='Pclass', aggfunc='mean')

# Flag passengers aged 18 or below, then compare mean survival by flag and sex.
df['Underaged'] = df['Age'].le(18)
pd.pivot_table(df, values='Survived', index='Underaged', columns='Sex', aggfunc='mean')
# --- Dates and times ---
# Standard-library datetime; requires `import datetime` at the top of the
# file (the module was previously used here without being imported).
dt = datetime.datetime(year=2020, month=10, day=5, hour=11, minute=9)
dt

# pandas' own timestamp type supports arithmetic with Timedelta.
ts = pd.Timestamp('2020-10-05')
ts
ts + pd.Timedelta('5 days')

# to_datetime parses several textual date formats.
pd.to_datetime('2020-10-05')
pd.to_datetime('10/05/2020')

# Converting a whole Series of strings yields a datetime Series.
s = pd.Series(['2020-10-01 00:00:00', '2020-10-03 00:00:00', '2020-10-05 00:00:00'])
s
ts = pd.to_datetime(s)
ts

# Ten timestamps spaced 12 hours apart. The lowercase 'h' alias is used
# because the uppercase 'H' spelling is deprecated in recent pandas.
pd.Series(pd.date_range(start='2020-10-05', periods=10, freq='12h'))
# --- Time-series indexing and resampling on the flow data ---
data = pd.read_csv('./data/flowdata.csv')
data.head(5)

# Parse the 'Time' column into datetimes and promote it to the index.
data = data.assign(Time=pd.to_datetime(data['Time'])).set_index('Time')
data

# The same result straight from the reader: first column as a parsed index.
data = pd.read_csv('./data/flowdata.csv', parse_dates=True, index_col=0)
data.head(5)

# Row slicing by time range: Timestamp objects or plain strings both work.
data[pd.Timestamp('2012-01-01 09:00'):pd.Timestamp('2012-01-01 19:00')]
data['2012-01-01 09:00':'2012-01-01 19:00']
# Partial strings select whole months.
data['2012-01':'2012-03']

# Rows whose timestamp falls in January.
data[data.index.month == 1]
# Rows whose time of day lies between 08:00 and 12:00.
data.between_time('08:00', '12:00')

# Resampling: aggregate into daily, 3-day and monthly buckets.
data.resample('D').mean().head(5)
data.resample('D').max().head(5)
data.resample('3D').mean().head(5)
# NOTE(review): pandas 2.2+ deprecates the 'M' alias in favour of 'ME';
# left unchanged here because 'ME' is rejected by older releases.
data.resample('M').mean().head(5)

import matplotlib.pyplot as plt
# Plot the monthly means.
data.resample('M').mean().plot()
plt.show()
# --- Sorting by several columns ---
data = pd.DataFrame({
    'group': list('aaabbbccc'),
    'data': [4, 3, 2, 1, 12, 3, 4, 5, 7],
})
data

# 'group' descending first, then 'data' ascending inside each group.
data = data.sort_values(by=['group', 'data'], ascending=[False, True])
data
# --- Sorting and removing duplicates ---
data = pd.DataFrame({
    'k1': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'k2': [3, 2, 1, 3, 3, 4, 4],
})
data

# Neither call below modifies `data`; each returns a new frame.
data.sort_values('k2')
# Keep only the first row seen for each distinct 'k1' value.
data.drop_duplicates('k1')
# --- Mapping values to categories: row-wise apply versus Series.map ---
data = pd.DataFrame({
    'food': ['A1', 'A2', 'B1', 'B2', 'B3', 'C1', 'C2'],
    'data': [1, 2, 3, 4, 5, 6, 7],
})
data

# Food code -> category letter. Defined before `food_map` so that the
# function and the Series.map call below share one table instead of
# repeating the same seven cases twice.
food2Upper = {
    'A1': 'A',
    'A2': 'A',
    'B1': 'B',
    'B2': 'B',
    'B3': 'B',
    'C1': 'C',
    'C2': 'C',
}


def food_map(series):
    """Return the category letter for the food code in one row.

    Args:
        series: A row-like object indexable by ``'food'`` (for example a
            row passed by ``DataFrame.apply(..., axis='columns')``).

    Returns:
        The category letter from ``food2Upper``, or ``None`` when the food
        code is not in the table.
    """
    return food2Upper.get(series['food'])


# Row-wise: call food_map once per row.
data['food_map'] = data.apply(food_map, axis='columns')
data

# Column-wise: translate the whole 'food' column through the table.
data['upper'] = data['food'].map(food2Upper)
data
# Load the Titanic passenger table and preview its first rows.
titanic = pd.read_csv(filepath_or_buffer='./data/titanic.csv')
titanic.head(5)
def hundredth_row(columns):
    """Return the element at positional index 99 of ``columns``.

    Args:
        columns: An object supporting positional ``.iloc`` indexing, such
            as a pandas Series.

    Returns:
        The item stored at position 99, i.e. the hundredth element.

    Raises:
        IndexError: If ``columns`` holds fewer than 100 elements.
    """
    return columns.iloc[99]
# Apply the helper to every column (axis=0 is the default direction).
# NOTE: this rebinds the name `hundredth_row` from the function to its result.
hundredth_row = titanic.apply(hundredth_row, axis=0)
hundredth_row
def not_null_count(columns):
    """Count the missing (null/NaN) entries in ``columns``.

    NOTE: despite its name, this returns the number of *null* values, not
    the number of non-null ones. The name is kept for existing callers.

    Args:
        columns: A pandas Series (or other array-like accepted by
            ``pd.isnull``).

    Returns:
        int: How many entries ``pd.isnull`` reports as missing.
    """
    # Summing the boolean mask counts the True entries directly, without
    # first building a filtered copy just to take its length.
    return int(pd.isnull(columns).sum())
# Run the counter once per column (axis=0 is the default direction).
columns_null_count = titanic.apply(not_null_count, axis=0)
columns_null_count
def which_class(row):
    """Translate a row's ``'Pclass'`` value into a readable label.

    Args:
        row: A row-like object indexable by ``'Pclass'``.

    Returns:
        ``'Unknow'`` when the value is missing, the matching
        ``'First class'`` / ``'Second class'`` / ``'Third class'`` label
        for 1, 2 or 3, and ``None`` for any other value.
    """
    pclass = row['Pclass']
    if pd.isnull(pclass):
        # The spelling 'Unknow' is preserved on purpose: code elsewhere
        # may compare against this exact label.
        return 'Unknow'
    labels = {1: 'First class', 2: 'Second class', 3: 'Third class'}
    return labels.get(pclass)
# Label every passenger row with its class description.
classes = titanic.apply(which_class, axis='columns')
classes
def is_minor(row):
    """Tell whether the row's ``'Age'`` is strictly below 18.

    Args:
        row: A row-like object indexable by ``'Age'``.

    Returns:
        bool: ``True`` when the age is less than 18, otherwise ``False``.
        A NaN age compares as not-less-than and therefore gives ``False``.
    """
    # The comparison already yields the answer; bool() normalises numpy
    # booleans to a plain Python bool.
    return bool(row['Age'] < 18)
# Evaluate the under-18 test on every passenger row.
minors = titanic.apply(is_minor, axis='columns')
minors
# --- Adding and removing a derived column ---
df = pd.DataFrame({
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})

# assign() returns a new frame with the extra column; `df` is unchanged.
df2 = df.assign(ration=lambda frame: frame['data1'] / frame['data2'])
df2

# Remove the column again ...
df2 = df2.drop(columns='ration')
df2

# ... and add it back through plain column assignment.
df2['ration'] = df['data1'].div(df['data2'])
df2
# --- Replacing values and binning numbers ---
data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9])
data
# Turn the value 9 into a missing value.
data = data.replace(9, np.nan)
data

# Bin the ages into the half-open intervals (10, 40] and (40, 80].
ages = [15, 18, 20, 21, 22, 34, 41, 52, 63, 79]
bins = [10, 40, 80]
bins_res = pd.cut(ages, bins)
bins_res

# Count members per bin. Series.value_counts is used because the top-level
# pd.value_counts function is deprecated.
pd.Series(bins_res).value_counts()

pd.cut(ages, [10, 30, 50, 80])

# Named bins instead of interval labels.
group_names = ['Yonth', 'Mille', 'Old']
pd.Series(pd.cut(ages, [10, 20, 50, 80], labels=group_names)).value_counts()
# --- Finding rows that contain missing values ---
df = pd.DataFrame([
    range(3),
    [0, np.nan, 0],
    [0, 0, np.nan],
    range(3),
])
df
# Keep the rows where at least one cell is null.
df.loc[df.isna().any(axis='columns')]

# Sample string data: mixed case plus a missing entry.
s = pd.Series(['A', 'b', 'B', 'gaer', 'AGER', np.nan])
s

# Index labels carrying stray leading/trailing spaces.
index = pd.Index([' left', ' middle ', 'right '])
index

# --- Cleaning column labels ---
df = pd.DataFrame(np.random.randn(3, 2), index=range(3), columns=['A a', 'B b'])
df
# Replace the space inside each label with an underscore.
df.columns = [label.replace(' ', '_') for label in df.columns]
df

df = pd.DataFrame(np.random.randn(3, 2), index=range(3), columns=['A a', 'B b'])
# This time remove the space altogether.
df.columns = [label.replace(' ', '') for label in df.columns]
df
# --- Vectorised string methods ---
s = pd.Series(['a_b_C', 'c_d_e', 'f_g_h'])
s
# Split on '_' and spread the pieces over separate columns.
s.str.split(pat='_', expand=True)
# n=1 stops after the first split, giving two columns.
s.str.split(pat='_', n=1, expand=True)

s = pd.Series(['A', 'Aas', 'Afgew', 'Ager', 'Agre', 'Ager'])
s

# One indicator column per distinct token found between '|' separators.
s = pd.Series(['a', 'a|b', 'a|c'])
s
s.str.get_dummies('|')