python数据分析的六个步骤(Python数据分析总结干货资料分享)
摘要:Python数据分析:numpy、scipy、matplotlib、pandas、scikit-learn、keras…;Python特点:简洁、开发效率高、运算速度慢、胶水特性(集成C语言);目标:提取有用信息;手段:研究、概括、总结。

(图片来自网络配图)
Python数据分析总结 ● 概述
● 数据分析的含义与目标
方法:统计分析方法
目标:提取有用信息
手段:研究、概括、总结
● Python与数据分析
Python特点:简洁、开发效率高、运算速度慢、胶水特性(集成C语言)
Python数据分析:numpy、scipy、matplotlib、pandas、scikit-learn、keras…
● Python数据分析大家族
numpy:数据结构基础
scipy:强大的科学计算方法(矩阵分析、信号分析、数理分析…)
matplotlib:丰富的可视化套件
pandas:基础数据分析套件
scikit-learn:强大的数据分析建模库
keras:人工神经网络
● Python数据分析环境搭建
平台:Windows、Linux
科学计算工具:Anaconda
●Python数据分析基础
● numpy
开源、数据计算扩展;ndarray、多维操作、线性代数
● numpy使用程序
import numpy as np
def main():
    """Print the type, shape, rank, dtype and sizes of a small 2x3 array."""
    lst = [[1, 3, 5], [2, 4, 6]]
    print(type(lst))
    np_lst = np.array(lst)
    print(type(np_lst))
    # The np.float alias was removed in NumPy 1.24; np.float64 is the dtype
    # it used to resolve to.
    np_lst = np.array(lst, dtype=np.float64)
    print(np_lst.shape)     # (rows, columns)
    print(np_lst.ndim)      # number of dimensions
    print(np_lst.dtype)     # element type
    print(np_lst.itemsize)  # bytes per element
    print(np_lst.size)      # total number of elements


if __name__ == "__main__":
    main()
执行结果:
<class 'list'>
<class 'numpy.ndarray'>
(2, 3)
2
float64
8
6
    
● numpy常用数组
# Common array constructors and random generators (argument separators
# restored; the statements did not parse without them).
print(np.zeros([2, 4]))              # 2x4 array of zeros
print(np.ones([3, 5]))               # 3x5 array of ones
print(np.random.rand(2, 4))          # 2x4 uniform samples from [0, 1)
print(np.random.rand())              # a single uniform sample
print("RandInt:")
print(np.random.randint(1, 10, 3))   # 3 integers drawn from [1, 10)
print("Randn:")  # standard normal distribution
print(np.random.randn(2, 4))
print("Choice")
print(np.random.choice([10, 20, 30]))
print("Distribute:")  # Beta distribution
print(np.random.beta(1, 10, 100))    # 100 samples from Beta(a=1, b=10)
执行结果:
[[ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]]
[[ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]]
[[ 0.80307088  0.25491367  0.54381007  0.10159737]
 [ 0.71565024  0.62473538  0.66892166  0.41078071]]
0.16467244260637237
RandInt:
[5 3 2]
Randn:
[[-0.51707383 -1.46091351 -0.78197086  0.44640286]
 [-0.0998081   0.40701679  0.07750661  0.66041753]]
Choice
10
Distribute:
[ 0.03897375  0.09804991  0.1617222  ...   0.12878516  0.11699157
  0.05681225]
    
● numpy常用操作
# Element-wise math on a nested list. `lst` is defined here so the snippet
# runs on its own; the printed results in the article correspond to this value.
lst = [[1, 3, 5], [2, 4, 6]]
print("Arange:")
print(np.arange(1, 11))  # integers 1..10 (the stop value is exclusive)
print("Exp:")
print(np.exp(lst))       # e ** x
print("Exp2:")
print(np.exp2(lst))      # 2 ** x
print("Sqrt:")
print(np.sqrt(lst))
print("Sin:")
print(np.sin(lst))
print("Log:")
print(np.log(lst))       # natural logarithm
执行结果:
Arange:
[ 1  2  3  4  5  6  7  8  9 10]
Exp:
[[   2.71828183   20.08553692  148.4131591 ]
 [   7.3890561    54.59815003  403.42879349]]
Exp2:
[[  2.   8.  32.]
 [  4.  16.  64.]]
Sqrt:
[[ 1.          1.73205081  2.23606798]
 [ 1.41421356  2.          2.44948974]]
Sin:
[[ 0.84147098  0.14112001 -0.95892427]
 [ 0.90929743 -0.7568025  -0.2794155 ]]
Log:
[[ 0.          1.09861229  1.60943791]
 [ 0.69314718  1.38629436  1.79175947]]
# Reductions along each axis of a 3x2x4 array.
lst = np.array([
    [[1, 2, 3, 4], [4, 5, 6, 7]],
    [[7, 8, 9, 10], [10, 11, 12, 13]],
    [[14, 15, 16, 17], [18, 19, 20, 11]],
])
print(lst.sum(axis=2))  # sum over the innermost axis -> shape (3, 2)
print(lst.sum(axis=1))  # sum over the middle axis    -> shape (3, 4)
print(lst.sum(axis=0))  # sum over the outermost axis -> shape (2, 4)
print("Max:")
print(lst.max(axis=1))
print("Min:")
print(lst.min(axis=0))
执行结果:
[[10 22]
 [34 46]
 [62 68]]
[[ 5  7  9 11]
 [17 19 21 23]
 [32 34 36 28]]
[[22 25 28 31]
 [32 35 38 31]]
Max:
[[ 4  5  6  7]
 [10 11 12 13]
 [18 19 20 17]]
Min:
[[1 2 3 4]
 [4 5 6 7]]
# Element-wise arithmetic, matrix product, joining, splitting and copying.
lst1 = np.array([10, 20, 30, 40])
lst2 = np.array([4, 3, 2, 1])
print("Add:")
print(lst1 + lst2)  # the "+" operator was lost from the original text
print("Sub:")
print(lst1 - lst2)
print("Mul:")
print(lst1 * lst2)
print("Div:")
print(lst1 / lst2)
print("Square:")
print(lst1 ** 2)
print("Dot:")
# Reshape both vectors to 2x2 so np.dot performs a matrix product.
print(np.dot(lst1.reshape([2, 2]), lst2.reshape([2, 2])))
print("Concatenate:")
print(np.concatenate((lst1, lst2), axis=0))
print("vstack:")
print(np.vstack((lst1, lst2)))
print("hstack:")
print(np.hstack((lst1, lst2)))
print("Split:")
print(np.split(lst1, 2))  # two equal parts
print(np.split(lst1, 4))  # four equal parts
print("Copy:")
print(np.copy(lst1))
执行结果:
Add:
[14 23 32 41]
Sub:
[ 6 17 28 39]
Mul:
[40 60 60 40]
Div:
[  2.5          6.66666667  15.          40.        ]
Square:
[ 100  400  900 1600]
Dot:
[[ 80  50]
 [200 130]]
Concatenate:
[10 20 30 40  4  3  2  1]
vstack:
[[10 20 30 40]
 [ 4  3  2  1]]
hstack:
[10 20 30 40  4  3  2  1]
Split:
[array([10, 20]), array([30, 40])]
[array([10]), array([20]), array([30]), array([40])]
Copy:
[10 20 30 40]
    
● 线性方程组
import numpy as np
from numpy.linalg import *
def main():
    """Print the identity matrix and the inverse, transpose, determinant
    and eigen-decomposition of a 2x2 matrix."""
    print(np.eye(3))  # 3x3 identity matrix
    lst = np.array([[1, 2], [3, 4]])
    print("Inv:")
    print(inv(lst))
    print("T:")
    print(lst.transpose())
    print("Det:")
    print(det(lst))
    print("Eig:")
    print(eig(lst))  # eigenvalues and eigenvectors


if __name__ == "__main__":
    main()
执行结果:
[[ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 0.  0.  1.]]
Inv:
[[-2.   1. ]
 [ 1.5 -0.5]]
T:
[[1 3]
 [2 4]]
Det:
-2.0
Eig:
(array([-0.37228132,  5.37228132]), array([[-0.82456484, -0.41597356],
       [ 0.56576746, -0.90937671]]))
    
● numpy其他方面应用
import numpy as np
from numpy.linalg import *
def main():
    """Print an FFT, a correlation-coefficient matrix and a polynomial."""
    print("FFT:")
    print(np.fft.fft(np.array([1, 1, 1, 1, 1, 1, 1, 1])))
    print("Coef:")
    print(np.corrcoef([1, 0, 1], [0, 2, 1]))
    print("Poly:")
    print(np.poly1d([2, 1, 3]))  # coefficients, highest power first


if __name__ == "__main__":
    main()
执行结果:
FFT:
[ 8.+0.j  0.+0.j  0.+0.j  0.+0.j  0.+0.j  0.+0.j  0.+0.j  0.+0.j]
Coef:
[[ 1.        -0.8660254]
 [-0.8660254  1.       ]]
Poly:
   2
2 x + 1 x + 3
    
● matplotlib
● 概述
matplotlib是关键的绘图库。
● 实现
import numpy as np
import matplotlib.pyplot as plt
def main():
    """Show a cos/sin line chart, then one figure holding a grid of demo
    panels: scatter, bar, pie, line ("polar"), heatmap, contour and 3D."""
    # line
    x = np.linspace(-np.pi, np.pi, 256, endpoint=True)
    c, s = np.cos(x), np.sin(x)
    plt.figure(1)
    plt.plot(x, c, color="blue", linewidth=1.0, linestyle="-", label="COS", alpha=0.5)
    plt.plot(x, s, "r*", label="SIN")
    plt.title("COS & SIN")
    ax = plt.gca()
    # Hide the right/top spines and move the other two to the data origin.
    ax.spines["right"].set_color("none")
    ax.spines["top"].set_color("none")
    ax.spines["left"].set_position(("data", 0))
    ax.spines["bottom"].set_position(("data", 0))
    ax.xaxis.set_ticks_position("bottom")
    ax.yaxis.set_ticks_position("left")
    plt.show()

    # All remaining panels share one figure. The original called plt.show()
    # after every panel; with a blocking backend that closes the figure, so
    # later pyplot calls no longer draw into `fig`. show() is now called once,
    # after the last panel.
    # scatter
    fig = plt.figure()
    ax = fig.add_subplot(3, 3, 1)
    n = 128
    X = np.random.normal(0, 1, n)
    Y = np.random.normal(0, 1, n)
    T = np.arctan2(Y, X)  # colour each point by its angle
    ax.scatter(X, Y, s=75, c=T, alpha=0.5)
    plt.xlim(-1.5, 1.5)
    plt.xticks([])
    plt.ylim(-1.5, 1.5)
    plt.yticks([])
    plt.axis()
    plt.title("scatter")
    plt.xlabel("x")
    plt.ylabel("y")

    # bar
    fig.add_subplot(332)
    n = 10
    X = np.arange(n)
    Y1 = (1 - X / float(n)) * np.random.uniform(0.5, 1.0, n)
    Y2 = (1 - X / float(n)) * np.random.uniform(0.5, 1.0, n)
    plt.bar(X, Y1, facecolor='#9999ff', edgecolor='white')
    plt.bar(X, -Y2, facecolor='#9999ff', edgecolor='white')
    # Label every bar with its height, just beyond the bar's end.
    for x, y in zip(X, Y1):
        plt.text(x + 0.4, y + 0.05, '%.2f' % y, ha='center', va='bottom')
    for x, y in zip(X, Y2):
        plt.text(x + 0.4, -y - 0.05, '%.2f' % y, ha='center', va='bottom')

    # pie
    fig.add_subplot(333)
    n = 20
    Z = np.ones(n)
    Z[-1] *= 2  # make the last wedge twice as large
    plt.pie(Z, explode=Z * .05, colors=['%s' % (i / float(n)) for i in range(n)],
            labels=['%.2f' % (i / float(n)) for i in range(n)])
    plt.gca().set_aspect('equal')
    plt.xticks([])
    plt.yticks([])

    # polar
    # NOTE(review): the panel is labelled "polar" but the axes are Cartesian;
    # pass projection="polar" to add_subplot if a polar plot was intended.
    fig.add_subplot(334)
    n = 20
    theta = np.arange(0.0, 2 * np.pi, 2 * np.pi / n)
    radii = 10 * np.random.rand(n)
    plt.plot(theta, radii)

    # heatmap
    fig.add_subplot(335)
    from matplotlib import cm
    data = np.random.rand(3, 3)
    cmap = cm.Blues
    # The return value was bound to `map` (shadowing the builtin) and never used.
    plt.imshow(data, interpolation='nearest', cmap=cmap, aspect='auto', vmin=0, vmax=1)

    # hot map (filled contours)
    fig.add_subplot(313)

    def f(x, y):
        return (1 - x / 2 + x ** 5 + y ** 3) * np.exp(-x ** 2 - y ** 2)

    n = 256
    x = np.linspace(-3, 3, n)
    y = np.linspace(-3, 3, n)
    X, Y = np.meshgrid(x, y)
    plt.contourf(X, Y, f(X, Y), 8, alpha=.75, cmap=plt.cm.hot)

    # 3D
    ax = fig.add_subplot(336, projection="3d")
    ax.scatter(1, 1, 3, s=100)
    plt.show()


if __name__ == "__main__":
    main()
    
● scipy
● 简介
数值计算库
● 积分
程序:
import numpy as np
from scipy.integrate import quad, dblquad, nquad


def main():
    """Print a single, a double and an n-dimensional numerical integral.

    Each printed value is the (result, estimated_error) tuple returned by
    the corresponding scipy.integrate routine.
    """
    # Integral of exp(-x) over [0, inf).
    print(quad(lambda x: np.exp(-x), 0, np.inf))
    # Double integral: x over [0, inf), inner variable t over [1, inf).
    print(dblquad(lambda t, x: np.exp(-x * t) / t ** 3, 0, np.inf,
                  lambda x: 1, lambda x: np.inf))

    def f(x, y):
        return x * y

    def bound_y():
        return [0, 0.5]

    def bound_x(y):
        # The limits of x depend on the current value of y.
        return [0, 1 - 2 * y]

    print(nquad(f, [bound_x, bound_y]))


if __name__ == "__main__":
    main()
执行结果:
(1.0000000000000002, 5.842607038578007e-11)
(0.3333333333366853, 1.3888461883425516e-08)
(0.010416666666666668, 4.101620128472366e-16)
    
● 优化器
import numpy as np
from scipy.optimize import minimize
def main():
    """Minimise the Rosenbrock function with Nelder-Mead and print the result."""
    # Optimizer
    def rosen(x):
        # Rosenbrock function; the "+" between the two terms was lost from
        # the original text.
        return sum(100.0 * (x[1:] - x[:-1] ** 2.0) ** 2.0 + (1 - x[:-1]) ** 2.0)

    x0 = np.array([1.3, 0.7, 0.8, 1.9, 1.2])
    # "xatol" is the current name of the Nelder-Mead option formerly spelled
    # "xtol" (absolute tolerance on x between iterations).
    res = minimize(rosen, x0, method="nelder-mead",
                   options={"xatol": 1e-8, "disp": True})
    print("ROSE MINI:", res)


if __name__ == "__main__":
    main()
执行结果:
Optimization terminated successfully.
             Current function value: 0.000000
         Iterations: 339
         Function evaluations: 571
ROSE MINI:  final_simplex: (array([[ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        ,  1.00000001,  1.00000001],
       [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ],
       [ 1.        ,  1.        ,  1.        ,  1.        ,  0.99999999]]), array([  4.86115343e-17,   7.65182843e-17,   8.11395684e-17,
         8.63263255e-17,   8.64080682e-17,   2.17927418e-16]))
           fun: 4.8611534334221152e-17
       message: 'Optimization terminated successfully.'
          nfev: 571
           nit: 339
        status: 0
       success: True
             x: array([ 1.,  1.,  1.,  1.,  1.])
    
● 插值
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d
from scipy.optimize import root


def main():
    """Find a root of x + 2*cos(x), then plot a cubic interpolation of a sine."""
    def fun(x):
        return x + 2 * np.cos(x)

    # Root finding, starting from the initial guess 0.1.
    sol = root(fun, 0.1)
    print("ROOT:", sol.x, sol.fun)

    # Interpolation: the function is interp1d (digit one); the original
    # spelled it "interpld", which cannot be imported.
    x = np.linspace(0, 1, 10)
    y = np.sin(2 * np.pi * x)
    li = interp1d(x, y, kind="cubic")
    x_new = np.linspace(0, 1, 50)
    y_new = li(x_new)

    # The original called bare figure()/plot()/show() without importing them.
    plt.figure()
    plt.plot(x, y, "r")          # the 10 original samples
    plt.plot(x_new, y_new, "k")  # the 50 interpolated points
    plt.show()
    print(y_new)


if __name__ == "__main__":
    main()
    
● 线性计算与矩阵分解
程序:
import numpy as np
from scipy import linalg as lg
def main():
    """Print the determinant, inverse, a linear solve and several
    decompositions of a 2x2 matrix using scipy.linalg."""
    arr = np.array([[1, 2], [3, 4]])
    print("Det:", lg.det(arr))
    print("Inv:", lg.inv(arr))
    b = np.array([6, 14])
    print("Sol:", lg.solve(arr, b))  # solves arr @ x = b
    print("Eig:", lg.eig(arr))
    print("LU:", lg.lu(arr))
    print("QR:", lg.qr(arr))
    print("SVD:", lg.svd(arr))
    print("Schur:", lg.schur(arr))


if __name__ == "__main__":
    main()
执行结果:
Det: -2.0
Inv: [[-2.   1. ]
 [ 1.5 -0.5]]
Sol: [ 2.  2.]
Eig: (array([-0.37228132+0.j,  5.37228132+0.j]), array([[-0.82456484, -0.41597356],
       [ 0.56576746, -0.90937671]]))
LU: (array([[ 0.,  1.],
       [ 1.,  0.]]), array([[ 1.        ,  0.        ],
       [ 0.33333333,  1.        ]]), array([[ 3.        ,  4.        ],
       [ 0.        ,  0.66666667]]))
QR: (array([[-0.31622777, -0.9486833 ],
       [-0.9486833 ,  0.31622777]]), array([[-3.16227766, -4.42718872],
       [ 0.        , -0.63245553]]))
SVD: (array([[-0.40455358, -0.9145143 ],
       [-0.9145143 ,  0.40455358]]), array([ 5.4649857 ,  0.36596619]), array([[-0.57604844, -0.81741556],
       [ 0.81741556, -0.57604844]]))
Schur: (array([[-0.37228132, -1.        ],
       [ 0.        ,  5.37228132]]), array([[-0.82456484, -0.56576746],
       [ 0.56576746, -0.82456484]]))
    
● pandas
● 简介
数据分析库
● 基础数据分析技术
import numpy as np
import pandas as pd
def main():
    """Walk through basic pandas usage.

    Covers data structures, inspection, selection, assignment, missing
    values, concatenation, merging and grouping. Everything is printed.
    """
    # Data Structure
    s = pd.Series([i * 2 for i in range(1, 11)])
    print(type(s))
    dates = pd.date_range("20170301", periods=8)
    df = pd.DataFrame(np.random.randn(8, 5), index=dates, columns=list("ABCDE"))
    print(df)

    # Basic
    print(df.head(3))
    print(df.tail(3))
    print(df.index)
    print(df.values)
    print(df.T)
    # DataFrame.sort() was removed from pandas; sort_values() replaces it.
    print(df.sort_values(by="C"))
    print(df.sort_index(axis=1, ascending=False))
    print(df.describe())

    # Select
    print(type(df["A"]))
    print(df[:3])
    print(df["20170301":"20170304"])
    print(df.loc[dates[0]])
    print(df.loc["20170301":"20170304", ["B", "D"]])
    print(df.iloc[1:2, 2:4])
    print(df.iloc[1, 4])
    # One combined mask instead of chained df[...][...], which makes pandas
    # re-align the second mask against the already-filtered frame.
    print(df[(df.B > 0) & (df.A < 0)])
    print(df[df > 0])
    print(df[df["E"].isin([1, 2])])

    # Set
    s1 = pd.Series(list(range(10, 18)), index=pd.date_range("20170301", periods=8))
    df["F"] = s1
    print(df)
    df.at[dates[0], "A"] = 0
    print(df)
    df.iat[1, 1] = 1
    df.loc[:, "D"] = np.array([4] * len(df))
    df2 = df.copy()
    # The source text assigned df2 to itself here, which does nothing.
    # NOTE(review): negating the positive entries (as in the pandas
    # "10 minutes" tutorial) is assumed to be the intent - confirm.
    df2[df2 > 0] = -df2
    print(df2)

    # Missing Value
    df1 = df.reindex(index=dates[:4], columns=list("ABCD") + ["G"])
    df1.loc[dates[0]:dates[1], "G"] = 1
    print(df1)
    print(df1.dropna())
    print(df1.fillna(value=2))

    # Concat
    pieces = [df[:3], df[-3:]]
    print(pd.concat(pieces))
    left = pd.DataFrame({"key": ["x", "y"], "value": [1, 2]})
    right = pd.DataFrame({"key": ["x", "z"], "value": [3, 4]})
    print("LEFT:", left)
    print("RIGHT:", right)  # label was misspelled "RIFHT:"
    print(pd.merge(left, right, on="key", how="left"))
    df3 = pd.DataFrame({"A": ["a", "b", "c", "b"], "B": list(range(4))})
    print(df3.groupby("A").sum())


if __name__ == "__main__":
    main()
    
● 时间、绘图
import numpy as np
import pandas as pd
from pylab import *
def main():
    """Print a per-second date range, then plot a cumulative random walk."""
    # Time Series
    # Lowercase "s" is the current alias for a one-second frequency; recent
    # pandas warns about the uppercase "S" spelling.
    t_exam = pd.date_range("20170301", periods=10, freq="s")
    print(t_exam)
    # Graph: 1000 daily values, accumulated into a random walk.
    ts = pd.Series(np.random.randn(1000),
                   index=pd.date_range("20170301", periods=1000))
    ts = ts.cumsum()
    ts.plot()
    show()


if __name__ == "__main__":
    main()
    
● scikit-learn
● 简介
数据挖掘建模、机器学习
● 机器学习与决策树
机器学习:因子–>结果
结果:
不带标记–>无监督学习(聚类);带标记–>监督学习
有限离散–>分类;连续–>回归
决策树:监督学习;树形结构
● Iris数据集
● 花萼长度
● 花萼宽度
● 花瓣长度
● 花瓣宽度
● 种类:Iris Setosa(山鸢尾)、Iris Versicolour(杂色鸢尾)、Iris Virginica(维吉尼亚鸢尾)
● 实现
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics


def main():
    """Train a decision tree on the Iris data and print accuracy and the
    confusion matrix for a held-out 20% test split."""
    # Pre-processing
    iris = load_iris()
    print(iris)
    print(len(iris["data"]))
    train_data, test_data, train_target, test_target = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=1)
    # Model
    clf = tree.DecisionTreeClassifier(criterion="entropy")
    clf.fit(train_data, train_target)
    y_pred = clf.predict(test_data)
    # Verify
    print(metrics.accuracy_score(y_true=test_target, y_pred=y_pred))
    print(metrics.confusion_matrix(y_true=test_target, y_pred=y_pred))


if __name__ == "__main__":
    main()
    
● keras
● 简介
人工神经网络
● 简单神经网络实现
Keras安装步骤:Anaconda CMD;conda install mingw libpython;pip install keras;pip install np_utils
● 实例
注意:需要修改C:/Users/username/.keras/keras.json,具体改后内容如下:{"backend": "theano", "image_data_format": "channels_first", "epsilon": 1e-07, "floatx": "float32"}。
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD
from sklearn.datasets import load_iris
from sklearn.preprocessing import LabelBinarizer
# sklearn.cross_validation was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split


def main():
    """Train a small dense network on the Iris data and print the class
    index predicted for each sample of a held-out 20% test split."""
    iris = load_iris()
    print(iris["target"])
    train_data, test_data, train_target, test_target = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=1)
    # One-hot encode the training labels for categorical cross-entropy.
    # (The original also encoded the full and the test targets but never
    # used those results, so they are dropped.)
    labels_train = LabelBinarizer().fit_transform(train_target)
    model = Sequential(
        [
            Dense(5, input_dim=4),
            Activation("relu"),
            Dense(3),
            Activation("sigmoid"),
        ]
    )
    # Optimizer
    # NOTE(review): recent Keras releases spell `lr` as `learning_rate` and
    # no longer accept `decay`; confirm against the installed version.
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(optimizer=sgd, loss="categorical_crossentropy")
    # `epochs` replaces the Keras 1 keyword `nb_epoch`.
    model.fit(train_data, labels_train, epochs=200, batch_size=40)
    # predict_classes() is gone from recent Keras; argmax over the predicted
    # scores yields the same class indices.
    print(np.argmax(model.predict(test_data), axis=-1))
    #model.save_weights("D:/w")
    #model.load_weights("D:/w")


if __name__ == "__main__":
    main()
    
(欢迎私信小编有干货分享哦!)




