# index
from pandas import Series, DataFrame
import pandas as pd
import os
# Small Series built from a plain list; no explicit index is supplied.
obj = Series([4, 7, -5, 3])
print(obj)

# Notebook-style inspection: these bare expressions are evaluated and
# discarded when the file runs as a script.
obj.values
obj.index

# Selection by a single label, then by a list of labels.
obj[1]
obj[[1, 3]]

# Boolean-mask filtering.
obj[obj > 0]
obj[obj > 3]

# Element-wise arithmetic and a membership test against the raw values.
obj * 2
4 in obj.values
# Toy student roster used to demonstrate basic DataFrame manipulation.
data = {
    'Student': [
        "Muhammad Abdullah",
        "Kayli Abernathy",
        "Jacqueline Alvarez",
        "Daniel Chavez",
        "Aramayis Dallakyan",
    ],
    'UIN': [925024924, 925024924, 925024924, 915140123, 925124214],
    'Graduation_Year': [2017, 2016, 2018, 2020, 2018],
}
frame = DataFrame(data)
frame

# Split every full name on whitespace once: the last token becomes the
# surname and the first token the given name.
name_parts = frame['Student'].str.split()
frame['Last_name'] = name_parts.str[-1]
frame['First_name'] = name_parts.str[0]
frame

# The combined name column is dropped in place.
frame.drop(columns=["Student"], inplace=True)

frame["First_name"]
# Rows whose first name matches exactly.
frame[frame["First_name"] == "Aramayis"]
# Raw string literal: in an ordinary string "\U" starts a unicode escape, so
# this Windows path would fail to parse.
os.chdir(r"C:\Users\dallakyan1988\Documents\Python_Presentation")
# read_csv() function
# read_csv(filepath, sep=', ', header='infer', names=None, index_col=None, usecols=None, skiprows=None, nrows=None, na_values=None)
# Load both data sets from the current working directory. For the first
# file, cells holding a single space are read as missing values.
data1 = pd.read_csv("Orange_data.csv", sep=",", na_values=' ')
data2 = pd.read_csv("Income.csv", sep=",")

# Index the income data by its 'Date' column parsed as datetimes.
data2.set_index(pd.DatetimeIndex(data2['Date']), inplace=True)
data1.head()
data2.head()

# Clear the index label. Assigning None replaces `del data2.index.name`,
# which raises AttributeError on current pandas (the attribute cannot be
# deleted).
data2.index.name = None
data1.dtypes
data2.dtypes
def missing_count(data):
    """Count the missing values in each column of *data*.

    Args:
        data: A pandas DataFrame.

    Returns:
        A DataFrame indexed by the column names of *data*, with a single
        column ``'Total_missing'`` holding the number of null entries.
    """
    missing = pd.DataFrame(data.isnull().sum(), columns=['Total_missing'])
    return missing
missing_count(data1)
missing_count(data2)

# Rows where per-capita consumption is missing.
data1[data1["Per_Capita_Cons"].isnull()]

# Drop NaN values (not always the best option): keep only the rows that
# have no missing value in any column.
cleaned = data1.dropna()
missing_count(cleaned)
# * the whole row is **`NaN`**
# * the whole column is **`NaN`**
# * or by threshold
from numpy import nan as NA

# 4x3 frame: one complete row, two partially missing rows, and one row that
# is missing entirely.
example = DataFrame(
    [
        [2.0, 6.5, 3.0],
        [1.0, NA, NA],
        [NA, NA, NA],
        [NA, 4.0, 4.0],
    ]
)
example

example.dropna()            # keep only rows without any missing value
example.dropna(how="all")   # drop only rows that are missing entirely
example.dropna(thresh=2)    # keep rows with at least two non-missing values
# Method 2
# Fill NaN values with the mean (a forward fill is computed as well, for
# comparison).
last_item_filled = data1.ffill()
# NOTE(review): the scalar mean of 'Per_Capita_Cons' is written into the
# missing cells of every column of data1, not only that column -- confirm
# this is intended.
mean_filled = data1.fillna(data1['Per_Capita_Cons'].mean())

# Show the rows that were missing, before and after each fill.
cons_missing = data1["Per_Capita_Cons"].isnull()
data1[cons_missing]
# `.ix` no longer exists in pandas; `.loc` with the boolean mask selects the
# same rows.
last_item_filled.loc[cons_missing, :]
mean_filled.loc[cons_missing, :]
# Method 3
import numpy as np
import matplotlib.pyplot as plt
from scipy import interpolate
from scipy.interpolate import Rbf, InterpolatedUnivariateSpline

# Coarse sample of y = 0.8*x**5 - 3*x**2, spaced pi/4 apart.
x = np.arange(0, 2*np.pi + np.pi/4, 2*np.pi/8)
y = 0.8*np.power(x, 5) - 3*np.power(x, 2)

# Fit a spline through the samples (s=0: no smoothing) and evaluate it on a
# finer grid.
tck = interpolate.splrep(x, y, s=0)
xnew = np.arange(0, 2*np.pi, np.pi/50)
ynew = interpolate.splev(xnew, tck, der=0)

plt.figure()
# Four series, in order: the sample points (markers), the spline fit, the
# exact function, and the straight-line join of the sample points.
plt.plot(x, y, 'x',
         xnew, ynew,
         xnew, 0.8*np.power(xnew, 5) - 3*np.power(xnew, 2),
         x, y, 'b')
# Labels follow the plotting order above; no RBF curve is drawn, so the
# previous 'Linear'/'RBF' labels were attached to the wrong series.
plt.legend(['Data', 'Cubic Spline', 'True', 'Linear'])
plt.axis([-0.05, 3.05, -4.05, 10])
plt.title('Cubic-spline interpolation')
plt.show()
# Fill the gaps in per-capita consumption by interpolation.
linear_interpolated = data1['Per_Capita_Cons'].interpolate(method='linear')
spline_interpolated = data1['Per_Capita_Cons'].interpolate(method='spline', order=3)

# Compare the raw series with each filled version.
plt.figure()  # previously `plt.figure` without the call, which did nothing
xi = data1.index  # x positions; `xi` was used below without being defined
plt.plot(xi, data1["Per_Capita_Cons"],
         xi, mean_filled["Per_Capita_Cons"], 'g',
         xi, linear_interpolated, 'b',
         xi, spline_interpolated, 'r')
plt.legend(["Real", 'mean_filled', 'Linear', "Spline"])
plt.show()

# Keep the spline-interpolated values from here on.
data1['Per_Capita_Cons'] = spline_interpolated
data2.head()
data2.tail()

# `pandas.datetime` has been removed from pandas; it was an alias of the
# standard-library class imported here.
from datetime import datetime

period = 7*12 + 9 - 2
# A month-end offset object is used instead of the deprecated 'm'/'M'
# string aliases.
index = pd.date_range('3/1/1995', periods=period, freq=pd.offsets.MonthEnd())

# Resample income to month-end frequency and interpolate the gaps.
upsampled = data2["Per_Capita_Income"].resample(pd.offsets.MonthEnd())
interpolated = upsampled.interpolate(method='spline', order=3)
print(interpolated.head(10))
interpolated.plot()
plt.show()

# The linear fill replaces the spline result. `order` has no meaning for
# linear interpolation, so it is no longer passed.
interpolated = upsampled.interpolate(method='linear')
print(interpolated.head(10))
interpolated.plot()
plt.show()

interpolated = pd.DataFrame(interpolated)
interpolated.head()
data1.head()

# Give data1 a month-end DatetimeIndex starting in January 1995. A month-end
# offset object is used instead of the deprecated 'm' string alias.
period = 7*12 + 9
data1index = pd.date_range('1/1/1995', periods=period, freq=pd.offsets.MonthEnd())
data1.set_index(data1index, inplace=True)
data1.head()

# Join the two frames on their date indexes. pd.merge already returns a
# DataFrame, so the extra DataFrame(...) wrapper is not needed.
Merged = pd.merge(data1, interpolated, left_index=True, right_index=True)
Merged.head()
Merged = Merged.drop(["Year", "Month "], axis=1)
Merged.head()
Merged.columns
# Three frames with the same columns and disjoint integer indexes.
df1 = pd.DataFrame({'A': np.random.uniform(size=4),
                    'B': np.random.uniform(size=4),
                    'C': np.random.uniform(size=4),
                    'D': np.random.uniform(size=4)},
                   index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': np.random.normal(size=4),
                    'B': np.random.normal(size=4),
                    'C': np.random.normal(size=4),
                    'D': np.random.normal(size=4)},
                   index=[4, 5, 6, 7])
df3 = pd.DataFrame({'A': np.random.uniform(size=4),
                    'B': np.random.uniform(size=4),
                    'C': np.random.uniform(size=4),
                    'D': np.random.uniform(size=4)},
                   index=[8, 9, 10, 11])
df1
df2
df3

# Stack the frames vertically (row-wise concatenation).
frames = [df1, df2, df3]
result = pd.concat(frames)
result

# A frame whose columns and index only partly overlap with df1.
df4 = pd.DataFrame({'B': np.random.uniform(size=4),
                    'D': np.random.normal(size=4),
                    'F': np.random.normal(size=4)},
                   index=[2, 3, 6, 7])
df4
df1

# Side-by-side concatenation, aligned on the index.
result = pd.concat([df1, df4], axis=1)
result

# Append df2's rows below df1. DataFrame.append was removed in pandas 2.0;
# pd.concat is the replacement.
result = pd.concat([df1, df2])
result
# Two frames sharing a 'key' column, each with its own value columns.
left = pd.DataFrame(
    {
        'key': ['A', 'G', 'E', 'C'],
        'A': np.random.normal(size=4),
        'B': np.random.normal(size=4),
    }
)
right = pd.DataFrame(
    {
        'key': ['A', 'G', 'E', 'C'],
        'C': np.random.normal(size=4),
        'D': np.random.normal(size=4),
    }
)

# Join the two frames on 'key'.
result = left.merge(right, on='key')
left
right
result
Merged.describe()

# Period-over-period fractional change of every column.
returns = Merged.pct_change()
returns

# Running sum of the period changes of 'Real_Price'.
plt.figure()
plt.plot(returns["Real_Price"].cumsum(), "b")
plt.xlabel("Time")
plt.ylabel("Percentage change")
plt.title("Cumulatice Percentage Change")
plt.show()

Merged.corr()
Merged.cov()

Merged["Per_Capita_Cons"].hist(bins=20)
plt.show()

# `density=True` replaces the `normed` keyword, which Matplotlib removed.
Merged["Per_Capita_Cons"].hist(bins=20, density=True)
Merged["Per_Capita_Cons"].plot(kind="kde")
plt.show()

# Overlay the KDE of 200 draws from a normal distribution that uses the
# sample mean and standard deviation of the column.
comp1 = np.random.normal(Merged["Per_Capita_Cons"].mean(),
                         Merged["Per_Capita_Cons"].std(),
                         size=200)
values = Series(comp1)
Merged["Per_Capita_Cons"].hist(bins=20, density=True)
values.plot(kind="kde")
plt.show()

# scatter_matrix lives in pandas.plotting; the top-level pd.scatter_matrix
# alias has been removed.
pd.plotting.scatter_matrix(Merged, diagonal="kde", alpha=0.5)
plt.show()