-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathsplit_csv_data.py
105 lines (101 loc) · 2.4 KB
/
split_csv_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python3
"""Split in train and test a .csv."""
import argparse
import logging
import sys
from pytoda.data_splitter import csv_data_splitter
# Logging setup: INFO-level records are written to stdout, and the script
# reports through a logger named after this file.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
logger = logging.getLogger('split_csv_data')
# Command-line interface. The first four options are required and select the
# input files, the output location, the output naming and the split strategy.
# The remaining options all have defaults; their values are forwarded to the
# splitter unchanged by the entry point below.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    '-f',
    '--filepaths',
    nargs='+',  # one or more paths, collected into a list
    help='list of files to use to generate the splits',
    required=True,
)
parser.add_argument(
    '-o',
    '--output_path',
    type=str,
    help='output path where to store the splits',
    required=True,
)
parser.add_argument(
    '-d',
    '--data_type',
    type=str,
    help='data type, used to generate the output file names',
    required=True,
)
parser.add_argument(
    '-m',
    '--mode',
    type=str,
    help='strategy used to split the data',
    choices=['random', 'file'],
    required=True,
)
parser.add_argument(
    '-s',
    '--seed',
    type=int,
    help='seed used by the random generator. Defaults to 42',
    default=42,
)
parser.add_argument(
    '-t',
    '--test_fraction',
    type=float,
    help='portion of samples in testing data. Defaults to 0.1',
    default=0.1,
)
parser.add_argument(
    '-n',
    '--number_of_columns',
    type=int,
    help='number of columns used to generate a hash. Defaults to 12',
    default=12,
)
parser.add_argument(
    '-i',
    '--index_col',
    type=int,
    help='index column in the .csv files. Defaults to 0',
    default=0,
)
parser.add_argument(
    '--separator',
    type=str,
    help='separators in the .csv files. Defaults to ","',
    default=',',
)
parser.add_argument(
    '--header',
    type=int,
    help='header row in the .csv files. Defaults to 0',
    default=0,
)
def main(argv=None):
    """Parse the command line, run the split and log the resulting paths.

    Args:
        argv: argument strings to parse. ``None`` (the default) lets
            argparse read them from the process command line.

    Returns:
        The ``(train_filepath, test_filepath)`` pair returned by
        ``csv_data_splitter``.
    """
    # parse arguments
    args = parser.parse_args(argv)
    # run the split; every CLI option is forwarded to the splitter as-is
    train_filepath, test_filepath = csv_data_splitter(
        data_filepaths=args.filepaths,
        save_path=args.output_path,
        data_type=args.data_type,
        mode=args.mode,
        seed=args.seed,
        test_fraction=args.test_fraction,
        number_of_columns=args.number_of_columns,
        index_col=args.index_col,
        sep=args.separator,
        header=args.header,
    )
    # lazy %-style arguments: the message is only formatted if it is emitted
    logger.info(
        'Data split into train_filepath=%s and test_filepath=%s.',
        train_filepath,
        test_filepath,
    )
    return train_filepath, test_filepath


if __name__ == '__main__':
    main()