add:diff_csv

main
yulonger's Desktop 2 years ago
parent 5d62ee9ddd
commit db15a87f9e

File diff suppressed because one or more lines are too long

@ -0,0 +1,47 @@
import os

import pandas as pd
def read_largest_column(file_path: str) -> int:
    """Return the field count of the widest line in a CSV file, plus one.

    Every line is split on a literal ``,``; quoted fields are not treated
    specially.

    Args:
        file_path: Path of the CSV file to scan.

    Returns:
        ``len(line.split(',')) + 1`` for the line with the most fields, or
        ``0`` when the file contains no lines.
    """
    # Iterate the file object directly so only one line is held in memory at
    # a time; the ``with`` block closes the file, so no explicit close().
    with open(file_path, 'r') as temp_f:
        # NOTE(review): the "+ 1" makes the result one larger than the real
        # field count. It is kept because callers size their column-name
        # list from this value -- confirm before removing it.
        return max((len(line.split(',')) + 1 for line in temp_f), default=0)
def cut_csv_to_model(file_path: str, pd_names=None, step: int = 1000):
    """Split a CSV file into consecutive row chunks saved under ``cut/``.

    Each chunk is written to
    ``cut/<file_path without extension>_<start>-<stop>.csv`` where
    ``stop = start + step``; the last file's name may therefore mention a
    ``stop`` beyond the actual number of rows.

    Args:
        file_path: Path of the CSV file to split.
        pd_names: Optional column names, passed to ``pandas.read_csv`` as
            ``names``.
        step: Maximum number of data rows per output file.

    Raises:
        ValueError: If ``step`` is not a positive integer.
    """
    if step <= 0:
        raise ValueError(f"step must be positive, got {step}")

    df = pd.read_csv(file_path, names=pd_names)
    # Total number of data rows in the file.
    row_num = len(df)

    # splitext removes only the final extension, so paths such as "./x.csv"
    # or "a.b.csv" keep the rest of their name (split('.')[0] did not).
    base_name = os.path.splitext(file_path)[0]

    for start in range(0, row_num, step):
        stop = start + step
        filename = f"cut/{base_name}_{start}-{stop}.csv"
        # Create the output directory on demand; to_csv does not create it.
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        d = df[start:stop]
        print("Saving file : " + filename + ", data size : " + str(len(d)))
        d.to_csv(filename, index=False)
# Input CSV to process.
file_path = "CICNTTor_browsing_raw.csv"
# Alternative input, e.g. one chunk written by cut_csv_to_model:
# file_path = "cut/CICNTTor_browsing_raw_0-1000.csv"

# Name the columns 0..column_count-1 so read_csv is given an explicit,
# fixed-width set of column names.
column_count = read_largest_column(file_path)
column_names = list(range(column_count))

# Optional pre-processing step: split the raw file into chunks first.
# cut_csv_to_model(file_path, pd_names=column_names)

df = pd.read_csv(file_path, header=None, delimiter=',', names=column_names)

# The first 8 columns are carried over unchanged; the remaining columns are
# replaced by the difference between each column and its right-hand
# neighbour (next column minus current column).
df_l = df.iloc[:, :8]
df_r = df.iloc[:, 8:]
df_r_l1 = df_r.shift(periods=-1, axis=1)
df_result = pd.concat([df_l, df_r_l1.sub(df_r)], axis=1)

df_result.to_csv('CICNTTor_browsing_diff.csv', index=False)
Loading…
Cancel
Save