|
#!/usr/bin/env python3
| 2 | +from os import listdir, path, mkdir |
| 3 | +import csv |
| 4 | +import yaml |
| 5 | + |
| 6 | + |
def load_conf(path_to_config="../config.yml"):
    """
    Load the YAML configuration file.

    :param path_to_config: path to the YAML config file
    :return: parsed configuration as a dict, or None when the file
        cannot be opened
    """
    try:
        with open(path_to_config, 'r') as config:
            # safe_load: yaml.load() without an explicit Loader is a
            # TypeError on PyYAML >= 6 and can execute arbitrary code on
            # older versions when the config is untrusted.
            return yaml.safe_load(config)
    except IOError:
        print('Failed to load %s' % path_to_config)
| 18 | + |
| 19 | + |
def clean_column_value(value):
    """
    Sanitize a single csv cell so the generated .nt file stays valid.

    Double quotes are blanked out and surrounding whitespace is trimmed.

    :param value: raw cell value to be cleaned
    :return: cleaned value
    """
    without_quotes = value.replace('"', ' ')
    return without_quotes.strip()
| 27 | + |
| 28 | + |
def extract_columns(file, columns_name, id_column_name):
    """
    Extract values from a csv file for the given column names.

    The id column is synthesized as the 0-based row index; every other
    requested column is read from the csv and cleaned with
    clean_column_value.

    :param file: path of the csv file to read
    :param columns_name: names of the columns to extract (must include
        ``id_column_name``)
    :param id_column_name: name of the synthetic identifier column
    :return: dict mapping each column name to its list of values
    :raises KeyError: when a requested column is missing from the csv header
    """
    columns = {column_name: [] for column_name in columns_name}
    with open(file) as csv_file:
        reader = csv.DictReader(csv_file)
        for index, row in enumerate(reader):
            columns[id_column_name].append(index)
            for column_name in columns:
                if column_name == id_column_name:
                    continue
                try:
                    columns[column_name].append(clean_column_value(row[column_name]))
                except KeyError:
                    # A missing column raises KeyError on row access (the
                    # original caught ValueError, which never fires here,
                    # and raised a bare string — itself a TypeError in
                    # Python 3).
                    raise KeyError(
                        'Key {} does not exist in {}'.format(column_name, file))
    return columns
| 51 | + |
| 52 | + |
def write_file(output_folder, file, columns):
    """
    Write the extracted columns out as a csv file.

    :param output_folder: destination folder prefix; it is concatenated
        directly with ``file``, so it must end with a path separator
    :param file: base name of the csv file to create
    :param columns: dict mapping column names to equally-long value lists
    :return: None
    """
    # newline='' is required by the csv module; without it the writer
    # emits an extra blank line between rows on platforms using \r\n.
    with open(output_folder + file, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(columns.keys())
        # zip(*values) transposes the per-column lists into rows.
        for row in zip(*columns.values()):
            writer.writerow(row)
| 60 | + |
| 61 | + |
def transform(output_folder, columns_name, id_column_name, file):
    """
    Transform one input csv file into the cleaned output format.

    :param output_folder: output folder for the transformed file
    :param columns_name: names of the columns to be transformed
    :param id_column_name: identifier column name
    :param file: path of the file to be transformed
    :return: None
    """
    extracted = extract_columns(
        file=file,
        columns_name=columns_name,
        id_column_name=id_column_name,
    )
    write_file(output_folder, path.basename(file), extracted)
| 73 | + |
| 74 | + |
if __name__ == '__main__':
    cfg = load_conf()
    if cfg is None:
        # load_conf returns None when the config file is unreadable; the
        # original fell through and crashed later with a TypeError.
        raise SystemExit('No configuration loaded; aborting.')
    input_folder = cfg['InputFolder']
    output_folder = cfg['OutputFolder']
    columns_name = cfg['Columns']
    id_column_name = cfg['Column_ID_Name']
    try:
        files = [input_folder + file
                 for file in listdir(input_folder) if file.endswith('.csv')]
    except OSError as e:
        # Unreadable/missing input folder is fatal: nothing to transform.
        raise SystemExit('Cannot list input folder: %s' % e)
    try:
        mkdir(output_folder)
    except FileExistsError:
        # Re-running against an existing output folder is fine.
        pass
    for file in files:
        transform(output_folder=output_folder, columns_name=columns_name,
                  id_column_name=id_column_name, file=file)
0 commit comments