[GSProcessing] Change default quoting and parsing for CSV input (#1169)

*Issue #, if available:*

*Description of changes:*

* Change the default Spark CSV parsing settings to follow RFC 4180 and match the pandas defaults.

By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice.
awslabs · Feb 15, 2025 · 4289130 · 4289130
1 parent ea6dff4
commit 4289130
Showing 1 changed file with 8 additions and 1 deletion.
diff --git a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py
@@ -1076,7 +1076,14 @@ def process_node_data(self, node_configs: Sequence[NodeConfig]) -> Dict:
             if node_config.format == "csv":
                 separator = node_config.separator
                 node_file_schema = schema_utils.parse_node_file_schema(node_config)
-                nodes_df_untyped = self.spark.read.csv(path=file_paths, sep=separator, header=True)
+                # Adjust reading of CSV files to follow RFC 4180
+                # https://www.ietf.org/rfc/rfc4180.txt
+                nodes_df_untyped = (
+                    self.spark.read.option("quote", '"')
+                    .option("escape", '"')
+                    .option("multiLine", "true")
+                    .csv(path=file_paths, sep=separator, header=True)
+                )
                 nodes_df_untyped = nodes_df_untyped.select(node_file_schema.fieldNames())
                 # Select only the columns referenced in the config
                 # and cast each column to the correct type