From 3bbe9895eebb5d8a7e4ad7ef4b6942412e5b6eba Mon Sep 17 00:00:00 2001 From: carla Date: Thu, 15 Jan 2026 11:08:22 +0100 Subject: [PATCH] fix: improve CSV parser error handling --- lib/mv/membership/import/csv_parser.ex | 53 ++++++++----------- test/mv/membership/import/csv_parser_test.exs | 32 +++++++++-- 2 files changed, 51 insertions(+), 34 deletions(-) diff --git a/lib/mv/membership/import/csv_parser.ex b/lib/mv/membership/import/csv_parser.ex index b9f036d..22a7496 100644 --- a/lib/mv/membership/import/csv_parser.ex +++ b/lib/mv/membership/import/csv_parser.ex @@ -23,13 +23,9 @@ defmodule Mv.Membership.Import.CsvParser do @spec parse(binary()) :: {:ok, row(), [numbered_row()]} | {:error, String.t()} def parse(file_content) when is_binary(file_content) do - content = - file_content - |> strip_bom() - |> normalize_line_endings() - - with :ok <- validate_content_not_empty(content), - :ok <- check_balanced_quotes(content), + with :ok <- validate_utf8(file_content), + content <- file_content |> strip_bom() |> normalize_line_endings(), + :ok <- validate_content_not_empty(content), {:ok, header_record, data_records} <- extract_header_and_data(content), :ok <- validate_header_not_empty(header_record), {:ok, headers, rows} <- parse_csv_records(header_record, data_records) do @@ -41,6 +37,15 @@ defmodule Mv.Membership.Import.CsvParser do def parse(_), do: {:error, "Invalid CSV content"} + @spec validate_utf8(binary()) :: :ok | {:error, String.t()} + defp validate_utf8(content) do + if String.valid?(content) do + :ok + else + {:error, "CSV must be valid UTF-8"} + end + end + @spec validate_content_not_empty(binary()) :: :ok | {:error, String.t()} defp validate_content_not_empty(content) do if String.trim(content) == "" do @@ -157,10 +162,10 @@ defmodule Mv.Membership.Import.CsvParser do try do rows = data_records - |> Enum.reduce([], fn {line_no, record}, acc -> + |> Enum.reduce_while([], fn {line_no, record}, acc -> case String.trim(record) do "" -> - acc + {:cont, acc} _ -> parsed = parser.parse_string(ensure_trailing_newline(record), skip_headers: false) @@ -168,20 +173,23 @@ defmodule Mv.Membership.Import.CsvParser do case parsed do [row] when is_list(row) -> if empty_row?(row) do - acc + {:cont, acc} else - [{line_no, row} | acc] + {:cont, [{line_no, row} | acc]} end - # empty row or unparsable row -> treat as skipped empty row + # unparsable row -> return error with line number _ -> - acc + snippet = String.slice(record, 0, min(50, String.length(record))) + {:halt, {:error, "Failed to parse CSV data at line #{line_no}: #{inspect(snippet)}"}} end end end) - |> Enum.reverse() - {:ok, rows} + case rows do + {:error, reason} -> {:error, reason} + rows -> {:ok, Enum.reverse(rows)} + end rescue e -> {:error, "Failed to parse CSV: #{Exception.message(e)}"} @@ -193,21 +201,6 @@ defmodule Mv.Membership.Import.CsvParser do Enum.all?(row, fn field -> String.trim(field) == "" end) end - # Check if quotes are balanced in the content. - # - # This is a simple check that counts quote characters. In CSV, escaped quotes - # are represented as "", so an odd number of quotes indicates unbalanced quotes. - @spec check_balanced_quotes(binary()) :: :ok | {:error, String.t()} - defp check_balanced_quotes(content) do - quote_count = content |> :binary.bin_to_list() |> Enum.count(&(&1 == @quote)) - - if rem(quote_count, 2) != 0 do - {:error, "Unbalanced quotes in CSV file"} - else - :ok - end - end - # --- Record splitting with correct line numbers (quote-aware) --- # # This splits the CSV into "records" separated by newline *outside quotes*, diff --git a/test/mv/membership/import/csv_parser_test.exs b/test/mv/membership/import/csv_parser_test.exs index 539e57e..e4a762b 100644 --- a/test/mv/membership/import/csv_parser_test.exs +++ b/test/mv/membership/import/csv_parser_test.exs @@ -109,7 +109,7 @@ defmodule Mv.Membership.Import.CsvParserTest do assert {:ok, headers, rows} = CsvParser.parse(csv_content) assert headers == ["email"] - # Lines 2, 3, 4 are empty (skipped), line 4 has data + # Lines 2 & 3 are empty (skipped), line 4 has data assert rows == [{4, ["john@example.com"]}] end end @@ -156,6 +156,19 @@ defmodule Mv.Membership.Import.CsvParserTest do assert headers == ["email", "name"] assert rows == [{2, ["john@example.com", "John \"Johnny\" Doe"]}] end + + test "handles multiline quoted fields with correct line numbering" do + # Header line 1 + # Data record starts line 2, contains "foo\nbar" in a field + # Record ends physically at line 3 + # Expected: row gets line number 2 (start line) + csv_content = "email;description\njohn@example.com;\"foo\nbar\"" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email", "description"] + assert rows == [{2, ["john@example.com", "foo\nbar"]}] + end end describe "error handling" do @@ -170,12 +183,23 @@ defmodule Mv.Membership.Import.CsvParserTest do assert reason =~ "CSV file is empty" end - test "returns {:error, reason} for invalid CSV format" do - # Unbalanced quotes - csv_content = "email;name\n\"john@example.com;John" + test "returns {:error, reason} for invalid UTF-8 content" do + # Invalid UTF-8 sequence + invalid_utf8 = <<0xFF, 0xFE, 0xFD>> + + assert {:error, reason} = CsvParser.parse(invalid_utf8) + assert reason =~ "UTF-8" + end + + test "returns {:error, reason} for unparsable data row" do + # Malformed CSV row that cannot be parsed + # NimbleCSV will throw an exception for unclosed quotes + csv_content = "email;name\njohn@example.com;\"unclosed quote" assert {:error, reason} = CsvParser.parse(csv_content) assert is_binary(reason) + # Error message should indicate parsing failure + assert reason =~ "parse" or reason =~ "CSV" end end