import os

import pandas as pd


def read_largest_column(file_path: str) -> int:
    """Return the widest line's comma-separated field count, plus one.

    Each line is split on a bare ``','`` (no CSV quoting rules are applied),
    so a quoted field that contains a comma counts as several fields.

    Args:
        file_path: Path of the text/CSV file to scan.

    Returns:
        ``max(len(line.split(',')) + 1)`` over all lines, or ``0`` when the
        file has no lines.
    """
    # NOTE(review): the ``+ 1`` makes the result one larger than the real
    # field count, so a frame read with these names gets an extra all-NaN
    # trailing column. Kept as-is because callers see this value; confirm
    # whether the padding is intentional.
    with open(file_path, 'r') as csv_file:
        # Stream the file instead of materialising every line in memory.
        return max(
            (len(line.split(',')) + 1 for line in csv_file),
            default=0,
        )


def cut_csv_to_model(file_path: str, pd_names=None, step: int = 1000):
    """Split a CSV file into consecutive chunks written under ``cut/``.

    The file is read with ``pd.read_csv(file_path, names=pd_names)`` and each
    slice of at most ``step`` rows is saved as
    ``cut/<file_path without extension>_<start>-<stop>.csv``. ``stop`` in the
    name is ``start + step`` even when the final chunk holds fewer rows.

    Args:
        file_path: Path of the CSV file to split.
        pd_names: Column names forwarded to ``pd.read_csv`` as ``names``.
        step: Maximum number of rows per output file.
    """
    df = pd.read_csv(file_path, names=pd_names)
    # Total number of rows in the file.
    row_num = len(df)

    # splitext only strips the final extension, so paths such as "./x.csv"
    # or "dir.v2/x.csv" keep their stem instead of collapsing at the first dot.
    base = os.path.splitext(file_path)[0]
    # to_csv does not create missing directories; make sure the target exists.
    os.makedirs(os.path.dirname(f"cut/{base}"), exist_ok=True)

    for start in range(0, row_num, step):
        stop = start + step
        filename = f"cut/{base}_{start}-{stop}.csv"
        chunk = df.iloc[start:stop]
        print(f"Saving file : {filename}, data size : {len(chunk)}")
        chunk.to_csv(filename, index=False)


file_path = "CICNTTor_browsing_raw.csv"
# file_path = "cut/CICNTTor_browsing_raw_0-1000.csv"

# Column names are the integers 0..column_count-1, so ragged rows can be
# loaded into one rectangular frame.
column_count = read_largest_column(file_path)
column_names = list(range(column_count))
# cut_csv_to_model(file_path, pd_names=column_names)

df = pd.read_csv(file_path, header=None, delimiter=',', names=column_names)

# The first 8 columns are carried over unchanged; the remaining columns are
# replaced by the difference between each column and its right-hand neighbour.
df_l = df[df.columns[:8]]
df_r = df[df.columns[8:]]
# Shifting left by one column aligns every column with the one after it; the
# last column has no neighbour and becomes NaN.
df_r_l1 = df_r.shift(periods=-1, axis=1)
df_result = df_r_l1 - df_r
df_result = pd.concat([df_l, df_result], axis=1)
# df_result = df_result.to_csv('CICNTTor_browsing_diff.csv', index=False)