#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 25 23:02:45 2023
@author: apache
"""
# Data structures
# --- Booleans: bitwise and/or on bools, and logical negation ---
x, y = True, False
print(x & y)
print(x | y)
print(not y)

# --- Numbers: arithmetic and exponentiation (** binds tighter than / and -) ---
x, y = 1, 3
x + y
x - y / x ** 2.3
2 ** 3
2 ** 0.25

# --- Strings ---
s = "agdg"
# Raw string: the backslash is kept as a literal character.
s = r"dafd\'fdfdfa"
# A trailing backslash inside a string literal continues it on the next line
# without inserting a newline.
sd = 'dfadfd\
dafdfd'
sg = r"dfdaoj\dsafd"
print(sg)
http = r"C:\adf\fdf"
print(http)
d = 'fdafd\
00adfad'
d + sd
"".join(d)
"f".join("56")
a = "ab123'"
list(a)
"-".join(list(a))
"-".join(a)  # a string is already iterable, list() is not required
a[0]

# --- Tuple and list (a list may mix element types) ---
h = (1, 3)
A = [1, "a", True, (1, 2), [3, 4]]
print(A)
type(A)
type(A[3])

# --- Sets: duplicates collapse, so ss holds three elements ---
ss = {1, 2, "a", 2}
ss
1 in ss
1 not in ss
sb = {2, 3}
# NOTE(review): `in` between two sets is an element-membership test (is the
# set itself an element of the other?), not a subset test; subset checks are
# written `ss <= sb` / `sb <= ss`.
ss in sb
sb in ss
sb & ss  # intersection
sb | ss  # union
ss - sb  # difference
sa = set()  # empty set ({} would create a dict)
ss ^ sb  # symmetric difference
## Data organisation / data forms
import pandas as pd
from pandas import Series

# A Series is a one-dimensional labelled array: values plus an index.
se = pd.Series([1, 2], index=['a', 'b'])
se
se.values
1 in se.values  # membership among the values
'a' in se       # `in` on a Series checks the index labels
type(se)
se.index
se.values

# The dict analogue: keys play the role of the index.
dict1 = {'a': 1, 'b': 2}
# values is a method; it must be called, otherwise the bound-method object
# itself is printed instead of the dictionary's values.
print(dict1.values())
1 in dict1.values()

from pandas import DataFrame

# A DataFrame built from a dict of equal-length columns.
df = DataFrame({'age': [1, 2, 3], 'name': ['a', 'b', 'c']})
df
len(df)     # number of rows
df.loc[:]   # all rows
# Program structure
# Python delimits blocks by indentation; statements need no trailing
# semicolon and a line holding only ";" is a syntax error.

# if / else
num = 1
if num == 1:
    print(num)
else:
    print('TG')

# if / elif / else
if num == 2:
    print(2)
elif num == 3:
    print(3)
else:
    print(1)

# for loop over a range of integers (0..9)
for i in range(0, 10):
    print(i)

# for loop over the characters of a string
for letter in 'Python':
    print(letter)

# while loop; the counter must be advanced inside the body
i = 1
while i <= 9:
    print(i)
    i += 1
# Functions
# General form of a named function (placeholders, not runnable as written):
#     def functionname(parameters):
#         function_suite
#         return [expression]
def functionname(parameters):
    """Minimal runnable instance of the template above.

    NOTE(review): the original notes only contained placeholder names for the
    body; returning the argument unchanged is an assumption made to keep the
    example valid Python.

    Args:
        parameters: any value.

    Returns:
        The value passed in, unchanged.
    """
    return parameters


# General form of an anonymous function (placeholders):
#     lambda para1, para2, ...: expression
# Runnable instance: a two-argument lambda that adds its arguments.
lambda para1, para2: para1 + para2
# Vectorized computation
# numpy -- array / arange / matrix
########################
# Topic
# numpy
import numpy as np

# help(np) renders the documentation of the whole package, which floods the
# output of a script run; call it by hand in an interactive session instead.

# Two arrays of identical shape so the element-wise operations below are
# valid. (ndarr was previously used before being defined.)
ndarr = np.arange(1, 10).reshape(3, 3)
ndarr1 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
type(ndarr)
ndarr + ndarr1
# Slicing: [rows, columns]
ndarr[:, :]
ndarr[:, 0]
ndarr[:, 0:3]
ndarr[:, 0:2]
ndarr == ndarr1  # element-wise comparison, yields a boolean array
# Explicit element membership: does every element of ndarr occur in ndarr1?
np.isin(ndarr, ndarr1).all()
ndarr[:, 1] == 1
# `|` and `&` bind tighter than `==`, so each comparison is parenthesised;
# without the parentheses the line parses as a chained comparison.
(ndarr[:, 1].any() == 1) | (1 == 1)
(ndarr[:, 1].any() == 1) & (1 == 1)

# Commonly used numpy functions
np.arange(10)
np.arange(0, 5, 0.5)
np.arange(10).reshape(5, 2)
np.zeros((3, 4))
np.ones((3, 4))
np.random.random((3, 4))
np.linspace(0, 10, 20).reshape(10, 2)
a = np.arange(10).reshape(5, 2)
a
a < 5
a - 1
a ** 2
a
b = a  # plain assignment: b is another name for the same array, not a copy
a * b  # element-wise multiplication
# Two spellings of matrix multiplication: (5, 2) x (2, 5) -> (5, 5)
a.dot(b.T)
np.dot(a, b.T)

ndarr = np.array(([1, 2, 3], [4, 5, 6]))
print(ndarr)
ndarr1 = np.arange(0, 20).reshape(5, 4)
ndarr2 = np.ones((5, 5))
# The two shapes, (5, 5) and (5, 4), cannot be compared element-wise, so the
# `not in` operator is replaced by a shape-independent membership test:
# True when no element of ndarr2 occurs in ndarr1.
not np.isin(ndarr2, ndarr1).any()
ndarr1.T.dot(ndarr2)  # (4, 5) x (5, 5) -> (4, 5)
ndarr3 = np.linspace(0, 100, 20)
type(ndarr3)
np.random.random()
np.random.random(3)

# Converting between list and ndarray
a = [1, 2, 3]
aa = np.array(a)
list(a)
b = '123'
list(b)

# ndarray deep copy vs. shallow copy
# deep copy    (no example was given in the original notes)
# shallow copy (no example was given in the original notes)
################################
# Topic: pandas
# Data processing and data storage
df = pd.read_excel('/Users/apache/Desktop/excel.xlsx')
type(df)
df.dtypes
df['A'] * df['C']
print(df.dtypes)
# The module is bound to the name `pd` only, so help must be requested
# through that name. Package-wide help(pd) is very long; run it by hand in an
# interactive session if needed.
help(pd.read_excel)

# Looking through the data
df.head()    # first rows
df.head(10)
df.tail()    # last rows
df.tail(10)
df2 = df.head(10)
df2.dtypes
df.columns
head = df.columns
df.shape
df.loc[0]    # select a row
df['A']      # select a column
df[['A', 'B']]
col = df.columns.tolist()
col1 = list(df.columns)
df[col]
# A separate loop variable keeps `col` intact as the list of column names.
for col_name in col:
    print(col_name)
for col_name in df.columns.tolist():
    print(col_name, '\n')  # '\n' adds a line break

# Sorting
df.sort_values('A', ascending=False)
df.sort_values('A', ascending=False, inplace=True)  # inplace=True: df itself is sorted, nothing is returned

# Missing values
df['A'].isnull()
pd.isnull(df['A'])
len(df['A'])  # number of entries in A, missing ones included
sum(df['A'])
mean = sum(df['A']) / len(df['A'])
# The built-in sum/len cannot deal with missing values; drop them first.
# NOTE(review): the original used df00[2] here, a frame that is only loaded
# further down the file; column 'A' of df is assumed to be what was meant.
a = df['A'][df['A'].notnull()]
sum(a) / len(a)
# mean() skips missing values
df['A'].mean()

# Pivot tables on a DataFrame
df.pivot_table(index='B', values='A', aggfunc='sum')
df.pivot_table(index='B', values='A', aggfunc=['sum', 'mean', 'count', 'max', 'min'])
x = df.pivot_table(index='B', values='A', aggfunc=['sum', 'mean', 'count'])
df.pivot_table(index='B', values='A')  # aggfunc defaults to mean when omitted
# Contingency-style table over several value columns
df.pivot_table(index='B', values=['A', 'C'], aggfunc=['sum', 'mean', 'count'])

# Addressing a single cell: always square brackets
df.loc[0, 'A']
# Selecting rows
df.loc[1]
df[1:2]
# Selecting columns
df['A']
df.A
df[['A', 'B']]         # several columns
df.loc[:, ['A', 'B']]  # several columns
# Filtering by condition
df[df['B'] == 'd']
# Combine conditions into one mask with & (each side parenthesised) instead
# of chaining two boolean selections, where the second mask no longer lines
# up with the already filtered frame.
df[(df['B'] == 'a') & (df['A'] >= 1)]

# pandas function application: apply
# pandas / DataFrame / Series structure
# A Series can be seen as ONE row or ONE column of a DataFrame -- the
# important part is the "one".
# pandas' DataFrame is the counterpart of numpy's ndarray; the array is the
# more general one: it can hold a single row/column or many.
# Converting between a DataFrame and an ndarray: ndarray = df.values
# ################################
# matplotlib
# Load the time series and turn the 'date' column into datetime values.
df2 = pd.read_excel('/Users/apache/Desktop/excel2.xlsx')
pd.to_datetime(df2['date'])
df2['date'] = pd.to_datetime(df2['date'])  # convert to datetime format

import matplotlib.pyplot as plt

# Whole frame first, then value against date, shown together.
plt.plot(df2)
plt.plot(df2['date'], df2['value'])
plt.show()

# First 30 observations only, with rotated tick labels and axis titles.
plt.plot(df2['date'].head(30), df2['value'].head(30))
plt.xticks(rotation=45)
plt.xlabel('year and month')
plt.ylabel('percentage %')
plt.title('matplotlib.pyplot')
plt.show()

# Pivot table via the module-level function; inside parentheses a statement
# may span several lines without a backslash.
df00 = pd.read_excel('/Users/apache/Downloads/for py.xlsx')
rst = pd.pivot_table(
    df00,
    values=['bonus', 'real_amount'],
    index='mall_name',
    aggfunc=['count', 'sum'],
)
pd.pivot_table(
    df00,
    values=['bonus', 'real_amount'],
    index='mall_name',
    aggfunc=['count', 'sum'],
)
# Semicolons are rarely used in Python; one is only needed to put several
# statements on the same line, e.g.
# print("Hello"); print("World!")
import matplotlib.pyplot as plt

# Two separate canvases with explicit sizes. Each chart is drawn through an
# Axes that belongs to its own figure, so `fig` and `fig1` stay Figure
# handles. Previously both plt.* calls drew on the most recently created
# figure and overwrote the handles with their return values.
fig = plt.figure(figsize=(10, 5))
fig.add_subplot(1, 1, 1).plot(df00['trade_time'], df00['real_amount'])
fig1 = plt.figure(figsize=(15, 3))
fig1.add_subplot(1, 1, 1).scatter(df00['bonus'], df00['real_amount'])
plt.show()

# Creating subplots: add_subplot(rows, columns, position)
fig3 = plt.figure(figsize=(10, 5))
ax1 = fig3.add_subplot(2, 2, 1)
ax2 = fig3.add_subplot(2, 2, 2)
ax3 = fig3.add_subplot(2, 2, 3)
ax4 = fig3.add_subplot(2, 2, 4)
ax1.plot(np.arange(30), np.random.randn(30), c='red', label='y\'s label')
ax1.legend(loc='best')  # the legend shows the label given to plot()
ax2.plot(np.arange(30), np.random.randn(30))
ax4.plot(np.arange(30), np.random.randn(30))

# Bar chart
fig411 = plt.figure(figsize=(3, 3))
ax1 = fig411.add_subplot(3, 3, 1)
ax9 = fig411.add_subplot(3, 3, 9)
ax1.bar(np.arange(20), np.random.random(20))
# pyplot-level calls draw on the current Axes.
# NOTE(review): that is presumably ax9, the subplot added last -- confirm.
plt.bar(np.arange(10), np.random.random(10))
ax9.scatter(np.random.random(30), np.random.random(30), c='red')

# Histogram
plt.hist(np.random.random(3), bins=3, density=False)
plt.hist(df00['cons_bonus_origin'], bins=10)
plt.ylim(10)  # a single positional value sets the lower y limit

# Boxplot
plt.boxplot(np.random.randn(30))
plt.boxplot(df00['cons_bonus_origin'])
# matplotlib.pyplot vs seaborn
# seaborn is a further wrapper on top of matplotlib that bundles the
# commonly used statistical charts.
import seaborn as sn

sn.set()  # apply seaborn's style template to subsequent plots

# Sine curve on 100 evenly spaced points in [0, 10]
x = np.linspace(0, 10, num=100)
np.sin(x)
plt.plot(x, np.sin(x))

# randn sample: histogram and boxplot
y = np.random.randn(10000)
plt.hist(y)
plt.boxplot(y)

# rand sample: histogram and boxplot
y1 = np.random.rand(10000)
plt.hist(y1)
plt.boxplot(y1)

# random_sample sample: histogram
y2 = np.random.random_sample(10000)
plt.hist(y2)

# Setting the theme
# darkgrid, whitegrid, dark, white, ticks -- sn.set_style()
sn.set_style('whitegrid')
y = np.random.randn(10000)
plt.hist(y)
plt.boxplot(y)

# A heatmap is drawn cell by cell, so its input has to be two-dimensional
# (x-y) data, e.g. (2,1), (3,3) ...
heatmapdata = np.random.random((30, 3))
sn.heatmap(heatmapdata, center=0, annot=True)
heatmapdata1 = df00.pivot_table(
    index='cons_bonus_origin',
    values='real_amount',
    aggfunc='count',
)
heatmapdata1
sn.heatmap(heatmapdata1, fmt='d', annot=True)
##############################
# Case study 1
##############################
# 1 - descriptive analysis
df00.shape
df00.describe()
df00.describe().T
desc_rst = df00.describe().T
# Drawing a sample; the size is capped at the number of rows because
# sample() without replacement cannot return more rows than exist.
df00.sample(min(30, len(df00)))

# 2 - aggregation: groupby vs pd.pivot_table(index=, values=, aggfunc=)
df00['real_amount'].groupby(df00['shop_name']).sum()
df00['real_amount'].groupby(df00['shop_name']).mean()
sales_rst = df00['real_amount'].groupby(df00['shop_name']).sum()
ATV_rst = df00['real_amount'].groupby(df00['shop_name']).mean()
import matplotlib.pyplot as plt
import seaborn as sbn
sbn.set()
sbn.set_style('whitegrid')
plt.hist(ATV_rst, bins=20)
# Frequency of each distinct value
ATV_rst.value_counts()
ATV_dist_rst = ATV_rst.value_counts()

# 3 - missing value processing
# Using a module
# pip install missingno
import missingno as msno
df00_sample = df00.sample(min(500, len(df00)))
msno.matrix(df00_sample)
msno.matrix(df00)
# Using functions
df00.isnull()
df00.isnull().sum()  # number of missing values per column
# np.sum(True)=1
# np.sum(False)=0
# True*1=1, False*1=0
df00['shop_code'].isnull()
df00['shop_code'].isnull().value_counts()
len(df00['shop_code'])  # total number of rows
# The boolean mask is used directly; comparing it with True is redundant.
df00['shop_code'][df00['shop_code'].isnull()]
col = df00.columns.tolist()
# A separate loop variable keeps `col` intact as the list of column names.
for col_name in col:
    missing_mask = df00[col_name].isnull()
    print(col_name, '\n', missing_mask.value_counts())
    # Summing the mask counts the True entries and yields 0 for a complete
    # column, whereas value_counts().loc[True] raises KeyError there.
    print(col_name, '\n', missing_mask.sum())
    print(col_name, '\n', df00[col_name])

# 4 - crosstab: counting only; handier than groupby; missing values are
#     left out
# crosstab combines well with seaborn's heatmap for visualisation
bb = pd.crosstab(df00['store_A_cate'], df00['store_B_cate'])
import seaborn as sbn
sbn.heatmap(bb)

# 5
# pd.unique(): distinct values of a field, = select distinct Field;
#     missing values are included
# pd.nunique(): number of distinct values, = select count(distinct Field)
# pd.value_counts(): group and count, = select Field, count(*) group by Field;
#     missing values are not counted
# pandas' binning function pd.qcut()
pd.qcut(df00['confidence'], 4)  # four quantile-based bins
pd.qcut(df00['confidence'], 4).value_counts()
a = pd.qcut(df00['confidence'], 4)  # four quantile-based bins
# Writing the result back into the DataFrame
df00['confidence_cate'] = pd.qcut(df00['confidence'], 4)
# Inspect the dtype of every column
df00.dtypes
##############################
# Case study 2
##############################
# Grouped boxplot: one box per mall, drawn from a 50-row sample with the
# incomplete rows removed.
df00[['real_amount', 'mall_name']].sample(50).dropna().boxplot(by='mall_name')
plt.boxplot(df00[['real_amount']].sample(50).dropna())

# Three ways to build a pivot table: crosstab, pivot_table, groupby
# (the original note ranks them by ease of use).
# NOTE(review): `bi` is not defined in the visible part of this file;
# presumably a DataFrame with project_name, year and sales columns that is
# loaded elsewhere -- confirm.
rst1 = pd.crosstab(
    index=bi.project_name,
    columns=bi.year,
    values=bi.sales,
    aggfunc='sum',
)
rst2 = bi.sales.groupby(by=[bi.project_name, bi.year]).sum()
rst3 = bi.pivot_table(
    values='sales',
    index='project_name',
    columns='year',
    aggfunc='sum',
)