import os
import numpy as np
from pathlib import Path
from sklearn import svm
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
# Root directory holding the dataset: one sub-folder per class.
# Sub-folder names must be integers (0, 1, 2, ...) because the folder
# name is parsed as the class label below.
arr_path = Path('D:/机器学习第三次作业/data')  # path separator: / or \\

all_file_list = []   # path of every .npy sample found
all_label_list = []  # integer label for each path (aligned index-for-index)

# Walk each class sub-folder, collect its .npy files, and repeat the
# folder's label once per file so the two lists stay parallel.
for class_dir in arr_path.iterdir():
    print(f'i==={class_dir}')
    sample_paths = list(class_dir.glob("**/*.npy"))  # all .npy files under this class
    label = int(class_dir.name)
    print(f'{label}含有样本数量:{len(sample_paths)}')
    print(sample_paths)
    all_file_list += sample_paths
    all_label_list += [label] * len(sample_paths)  # e.g. [1]*3 == [1, 1, 1]

# Pair every sample path with its label: [(path, label), ...]
all_data = list(zip(all_file_list, all_label_list))
# print(all_data[:10])
# Load every .npy file from disk; keep features and labels in two
# parallel lists (converted to numpy arrays later for sklearn).
data_x = []  # raw feature arrays, one per sample
data_y = []  # matching integer class labels
for sample_path, label in all_data:
    data_x.append(np.load(sample_path))
    data_y.append(label)
# sklearn's SVM expects numpy arrays, so convert the Python lists built above.
data_feature = np.array(data_x)
data_label = np.array(data_y)

# Flatten each sample to a 1-D feature vector: (n_samples, n_features).
# BUGFIX: the sample count was hard-coded as 1060, which raises for any
# other dataset size — derive it from the data instead.
data_feature = data_feature.reshape(data_feature.shape[0], -1)

# Min-max scale features to [0, 1] (try disabling this to compare its
# effect on the classification result).
data_feature = preprocessing.MinMaxScaler().fit_transform(data_feature)

# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    data_feature, data_label, test_size=0.3, random_state=1000)

# RBF-kernel SVM. NOTE: max_iter=1000 caps solver iterations, so the
# optimizer may stop before converging (sklearn will warn if it does).
predictor = svm.SVC(gamma='scale', C=1.0, max_iter=1000)
predictor.fit(X_train, y_train)

# Report precision/recall/F1 on the training set, then on the held-out
# test set, to expose any over-fitting gap between the two.
predictions_labels = predictor.predict(X_train)
print(classification_report(y_train, predictions_labels))
predictions_labels = predictor.predict(X_test)
print(classification_report(y_test, predictions_labels))