# NOTE: this file was copied from a repository web page. The page text that was
# captured with it (topic-selection hints and the "48 lines / 1.5 KiB / Python"
# file summary) is not part of the program and is kept here only as a comment.
import os

import pandas as pd


def read_largest_column(file_path: str) -> int:
    """Return the column count to use when loading a ragged CSV file.

    Every line of ``file_path`` is split on ``','`` and the widest line wins.
    The returned value is that field count **plus one** (an empty file gives
    ``0``); the extra slot is kept from the original implementation because
    the caller sizes its list of column names from this number.

    The split is a plain text split: commas inside quoted fields are counted
    as separators, so the result can only over-estimate the real width.

    Args:
        file_path: Path of the CSV file to scan.

    Returns:
        Largest per-line field count plus one, or 0 when the file has no lines.
    """
    # Iterate the handle directly so large files are streamed line by line
    # instead of being loaded into memory in one go.
    with open(file_path, 'r') as csv_file:
        # Find the line with the most columns.
        return max((len(line.split(',')) + 1 for line in csv_file), default=0)


def cut_csv_to_model(file_path: str, pd_names=None, step: int = 1000,
                     out_dir: str = "cut") -> None:
    """Split a CSV file into consecutive chunks of at most ``step`` rows.

    The file is loaded with ``pd.read_csv(file_path, names=pd_names)`` and each
    chunk is written to ``{out_dir}/{base}_{start}-{stop}.csv`` where ``base``
    is ``file_path`` without its final extension and ``stop = start + step``
    (the name of the last chunk may therefore overshoot the real row count).

    Args:
        file_path: CSV file to split.
        pd_names: Column names forwarded to ``pd.read_csv``; ``None`` lets
            pandas take them from the first row.
        step: Maximum number of data rows per output file (default 1000).
        out_dir: Directory prefix for the output files (default ``"cut"``).
            It is created if it does not exist.

    Raises:
        ValueError: If ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError("step must be a positive integer")

    df = pd.read_csv(file_path, names=pd_names)
    # Total number of rows in the file.
    row_num = len(df)

    # Strip only the final extension: a plain split('.') would truncate
    # paths such as "./data.csv" or "a.b.csv" at the first dot.
    base = os.path.splitext(file_path)[0]
    prefix = f"{out_dir}/{base}"

    # Make sure the destination exists before the first write; to_csv does
    # not create missing directories.
    target_dir = os.path.dirname(prefix)
    if row_num and target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # `step` is the amount of data each small file contains.
    for start in range(0, row_num, step):
        stop = start + step
        filename = f"{prefix}_{start}-{stop}.csv"
        chunk = df.iloc[start:stop]
        print(f"Saving file : {filename}, data size : {len(chunk)}")
        chunk.to_csv(filename, index=False)


# Script section: build a "difference" version of the raw CSV.
file_path = "CICNTTor_browsing_raw.csv"
# Alternative input: a single chunk produced by cut_csv_to_model.
# file_path = "cut/CICNTTor_browsing_raw_0-1000.csv"

# Rows may have different lengths, so name the columns 0..N-1 using the width
# reported by read_largest_column; shorter rows are padded with NaN by pandas.
column_count = read_largest_column(file_path)
column_names = list(range(column_count))

# Optional pre-processing step: split the raw file into smaller chunks.
# cut_csv_to_model(file_path, pd_names=column_names)

df = pd.read_csv(file_path, header=None, delimiter=',', names=column_names)

# The first 8 columns are copied through unchanged; the remaining columns are
# replaced by the difference between each column and its right-hand neighbour.
# NOTE(review): presumably the first 8 columns are metadata and the rest a
# numeric sequence — confirm against the data set.
df_l = df[df.columns[:8]]
df_r = df[df.columns[8:]]

# Shift every value one column to the left so that column i holds the value
# originally in column i + 1 (the last column becomes NaN).
df_r_l1 = df_r.shift(periods=-1, axis=1)
df_result = df_r_l1 - df_r

df_result = pd.concat([df_l, df_result], axis=1)
df_result.to_csv('CICNTTor_browsing_diff.csv', index=False)