Skip to content

Commit ab1d986

Browse files
committed
preprocess upload csv file in order to get valid rdf file
1 parent d3a4645 commit ab1d986

File tree

2 files changed

+101
-0
lines changed

2 files changed

+101
-0
lines changed

config.yml

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# folder for input
2+
InputFolder: '../data/input/'
3+
# folder for output
4+
OutputFolder: '../data/output/'
5+
# columns to extract
6+
Columns:
7+
- 'ID'
8+
- 'Prctr'
9+
- 'Bezeichnung'
10+
- 'Jahr'
11+
- 'Produktbereich'
12+
- 'Kostenart'
13+
- 'Betrag'
14+
- 'Budget_Richtung'
15+
# name of the column that contains the ID of each row
16+
Column_ID_Name: 'ID'

src/transform.py

+85
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
#!/usr/bin/env python3
2+
from os import listdir, path, mkdir
3+
import csv
4+
import yaml
5+
6+
7+
def load_conf(path_to_config="../config.yml"):
    """
    Load the YAML configuration file.

    :param path_to_config: path of the YAML configuration file
    :return: the parsed configuration, or None when the file cannot be opened
    """
    try:
        with open(path_to_config, 'r') as config:
            # safe_load only builds plain Python objects. A bare yaml.load()
            # without an explicit Loader is rejected by PyYAML >= 6 and can
            # construct arbitrary objects on older releases.
            return yaml.safe_load(config)
    except IOError:
        print('Failed to load %s' % path_to_config)
        # Make the "no configuration" result explicit for callers.
        return None
18+
19+
20+
def clean_column_value(value):
    """
    Sanitise a single csv cell so that it does not lead to an invalid .nt
    file further down the pipeline.

    Every double quote is replaced by a space, then leading and trailing
    whitespace is removed.

    :param value: the string to clean
    :return: the cleaned string
    """
    without_quotes = value.replace('"', ' ')
    return without_quotes.strip()
27+
28+
29+
def extract_columns(file, columns_name, id_column_name):
    """
    Extract the requested columns from a csv file.

    The identifier column is not read from the file: it is filled with the
    zero-based index of each data row. Every other value is passed through
    clean_column_value.

    :param file: path of the csv file to read
    :param columns_name: names of the columns to extract
    :param id_column_name: name of the identifier column; it has to be one
        of columns_name
    :return: dictionary mapping each column name to the list of its values
    :raises KeyError: if a requested column does not exist in the file
    """
    columns = {column_name: [] for column_name in columns_name}
    # newline='' leaves line-ending handling to the csv module, so quoted
    # fields with embedded newlines are parsed correctly.
    with open(file, newline='') as csv_file:
        reader = csv.DictReader(csv_file)
        for index, row in enumerate(reader):
            columns[id_column_name].append(index)
            for column_name, values in columns.items():
                if column_name == id_column_name:
                    continue
                try:
                    raw_value = row[column_name]
                except KeyError:
                    # A missing column is a KeyError (not a ValueError);
                    # re-raise it with a message naming column and file.
                    raise KeyError(
                        'Key {} does not exist in {}'.format(column_name, file)
                    ) from None
                values.append(clean_column_value(raw_value))
    return columns
51+
52+
53+
def write_file(output_folder, file, columns):
    """
    Write the extracted columns to a csv file inside the output folder.

    The first row holds the column names; each following row holds one
    value per column. Rows are built with zip, so the output stops at the
    shortest column.

    :param output_folder: folder in which the file is created
    :param file: name of the csv file to write
    :param columns: dictionary mapping each column name to the list of its
        values
    :return:
    """
    # path.join works whether or not output_folder ends with a separator.
    # newline='' keeps the text layer from translating the line endings the
    # csv writer already emits.
    with open(path.join(output_folder, file), 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(columns.keys())
        # zip(*...) transposes the per-column lists into rows.
        writer.writerows(zip(*columns.values()))
60+
61+
62+
def transform(output_folder, columns_name, id_column_name, file):
    """
    Extract the selected columns of one csv file and write the result,
    under the same file name, into the output folder.

    :param output_folder: folder that receives the transformed file
    :param columns_name: names of the columns to keep
    :param id_column_name: name of the identifier column
    :param file: path of the csv file to transform
    :return:
    """
    extracted = extract_columns(file, columns_name, id_column_name)
    target_name = path.basename(file)
    write_file(output_folder, target_name, extracted)
73+
74+
75+
if __name__ == '__main__':
    # Script entry point: transform every .csv file of the configured input
    # folder into the configured output folder.
    cfg = load_conf()
    if cfg is None:
        # load_conf has already reported why the configuration is missing.
        raise SystemExit(1)
    input_folder, output_folder = cfg['InputFolder'], cfg['OutputFolder']
    columns_name, id_column_name = cfg['Columns'], cfg['Column_ID_Name']
    try:
        files = [path.join(input_folder, name)
                 for name in listdir(input_folder) if name.endswith('.csv')]
    except OSError as e:
        # Without a listing of the input folder there is nothing to do;
        # exit with the error message, not with an unrelated TypeError.
        raise SystemExit(e)
    try:
        mkdir(output_folder)
    except FileExistsError:
        # Re-using an already existing output folder is fine.
        pass
    for file in files:
        transform(output_folder=output_folder, columns_name=columns_name,
                  id_column_name=id_column_name, file=file)

0 commit comments

Comments
 (0)