add:diff_csv
parent
5d62ee9ddd
commit
db15a87f9e
File diff suppressed because one or more lines are too long
@ -0,0 +1,47 @@
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def read_largest_column(file_path: str) -> int:
    """Return the field count of the widest line in a CSV file, plus one.

    Every line is split on a literal ``,``; no CSV quoting rules are applied,
    so a quoted field that contains commas is counted as several fields.

    Args:
        file_path: Path of the text/CSV file to scan.

    Returns:
        The largest ``len(line.split(',')) + 1`` over all lines, or ``0`` when
        the file contains no lines.
    """
    # Iterate the file object directly so only one line is held in memory at a
    # time; the ``with`` block closes the handle, no explicit close() needed.
    with open(file_path, 'r') as csv_file:
        # NOTE(review): the "+ 1" reports one column more than the widest row
        # actually has. It is kept because callers size their column-name list
        # from this value -- confirm whether the extra column is intended.
        return max(
            (len(line.split(',')) + 1 for line in csv_file),
            default=0,
        )
|
||||
|
||||
|
||||
def cut_csv_to_model(file_path: str, pd_names=None, step: int = 1000):
    """Split a CSV file into consecutive chunk files under ``cut/``.

    The file is loaded with ``pd.read_csv`` and written back out in slices of
    ``step`` rows. Each chunk is saved as
    ``cut/<file_path without extension>_<start>-<stop>.csv`` where ``stop`` is
    ``start + step`` (it may exceed the real row count for the last chunk).

    Args:
        file_path: Path of the CSV file to split.
        pd_names: Optional column names forwarded to ``pd.read_csv(names=...)``.
        step: Maximum number of rows per chunk file. Defaults to 1000.

    Raises:
        ValueError: If ``step`` is smaller than 1.
    """
    # Function-scope import keeps this block self-contained.
    import os

    if step < 1:
        raise ValueError("step must be a positive integer")

    df = pd.read_csv(file_path, names=pd_names)
    # Total number of rows in the file
    row_num = len(df)

    # Strip only the final extension; splitting on the first "." would mangle
    # names such as "./data.csv" or "capture.v2.csv".
    base_name = os.path.splitext(file_path)[0]

    for start in range(0, row_num, step):
        stop = start + step
        filename = f"cut/{base_name}_{start}-{stop}.csv"
        # Make sure the target directory exists before writing into it.
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        d = df[start:stop]
        print("Saving file : " + filename + ", data size : " + str(len(d)))
        d.to_csv(filename, index=False)
|
||||
|
||||
|
||||
# Input file. A chunk such as "cut/CICNTTor_browsing_raw_0-1000.csv" can be
# substituted here to run on a smaller sample.
file_path = "CICNTTor_browsing_raw.csv"

# Column labels are the integers 0 .. column_count - 1.
column_count = read_largest_column(file_path)
column_names = list(range(column_count))

# Optional splitting step, currently disabled:
# cut_csv_to_model(file_path, pd_names=column_names)

df = pd.read_csv(file_path, header=None, delimiter=',', names=column_names)

# The first 8 columns are carried over unchanged; the rest are differenced.
df_l = df.iloc[:, :8]
df_r = df.iloc[:, 8:]

# Shift the right-hand part one column to the left so that every column lines
# up with its right-hand neighbour, then subtract the unshifted values.
df_r_l1 = df_r.shift(periods=-1, axis=1)
df_result = pd.concat([df_l, df_r_l1.sub(df_r)], axis=1)

df_result.to_csv('CICNTTor_browsing_diff.csv', index=False)
|
Loading…
Reference in New Issue