Pandas笔记--表格利器

处理表格数据常用的pandas库,mark一下常用的指令和技巧。

索引方式
1
2
3
4
5
6
7
8
9
10
11
12
13
# DataFrame access patterns (df is assumed to be a pandas DataFrame).
df.columns # column labels of the frame
df.values # cell values as a NumPy ndarray
df.iloc[:, 1:3] # all rows, columns at positions 1 and 2 (slice end is exclusive); the result is a DataFrame, not a Series
df.columns.values[0] # label of the first column (this expression only reads it; it does not set anything)
y = train_data.pop('30').values # pop removes column '30' from train_data and returns it; here that column is used as the labels y
col = train_data.columns # labels of the columns that remain after the pop
x = train_data[col].values # remaining columns as an ndarray

# Series access patterns (sr is assumed to be a pandas Series, i.e. a single column).
sr.index # row index labels
sr.name # name of the Series (read here; assign to sr.name to change it)
sr.values # values as a NumPy ndarray
标准化
1
2
3
# Min-max scaling: map every column to [0, 1] via (x - min) / (max - min).
# The first column is dropped before scaling.
# NOTE(review): a constant column has max == min, so its range is 0 and the
# division yields NaN for that column.
df = df.iloc[:, 1:]
col_min = df.min()
col_range = df.max() - col_min
dat = (df - col_min) / col_range
分类和回归交叉验证框架
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
## classification
# Repeated stratified k-fold evaluation of a 3-nearest-neighbour classifier,
# with tree-based feature selection refit on the training part of every fold.
# Reads from the enclosing scope:
#   kf      -- int, number of folds per repetition
#   label_X -- feature table, indexed positionally with .iloc
#   label_Y -- labels, indexed with integer arrays
#              NOTE(review): if label_Y is a Series with a non-default index,
#              use label_Y.iloc[...] so the lookup stays positional.
# Result: scores[:, 0] is the mean per-class recall, scores[:, 1] the weighted
# F1, one row per evaluated split.
n_top_features = 80

# Keep the splitter under its own name so `kf` stays the integer fold count.
splitter = RepeatedStratifiedKFold(n_splits=kf)
# get_n_splits() returns n_splits * n_repeats, so the score table always has
# exactly one row per split yielded below.
scores = np.zeros([splitter.get_n_splits(), 2])
for i, (train_idx, test_idx) in enumerate(splitter.split(label_X, label_Y)):
    y_train, y_test = label_Y[train_idx], label_Y[test_idx]

    # select features (fitted on the training fold only, to avoid leakage)
    model = ExtraTreesClassifier(n_estimators=100)
    model.fit(label_X.iloc[train_idx, :], y_train)
    fim = model.feature_importances_
    # argsort is ascending; reverse it so the most important features come
    # first, then keep the top n_top_features column positions.
    idx = np.argsort(fim)[::-1][:n_top_features]

    x_train, x_test = label_X.iloc[train_idx, idx], label_X.iloc[test_idx, idx]

    # KNN classification
    neigh = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
    y_pred = neigh.predict(x_test)
    cm = confusion_matrix(y_test, y_pred)
    # Rows of cm are true classes, so diagonal / row sum is per-class recall.
    scores[i, 0] = np.mean(np.diag(cm) / np.sum(cm, axis=1))
    scores[i, 1] = f1_score(y_test, y_pred, average='weighted')

print(np.mean(scores, axis=0), np.std(scores, axis=0))

## regression
# Repeated hold-out evaluation of a linear-kernel SVR: every run draws a fresh
# random split of label_X / label_Y with 30% held out for testing.
# scores[:, 0] collects the test mean squared error, scores[:, 1] the test
# mean absolute error; the final line prints their mean and std over the runs.
scores = np.zeros([10, 2])
for run in range(scores.shape[0]):
    x_train, x_test, y_train, y_test = train_test_split(label_X, label_Y, test_size=0.3)

    # SVR regression
    regressor = SVR(kernel='linear').fit(x_train, y_train)
    y_pred = regressor.predict(x_test)
    scores[run] = (
        mean_squared_error(y_test, y_pred),
        mean_absolute_error(y_test, y_pred),
    )

print(scores.mean(axis=0), scores.std(axis=0))