From 699d4385cba3d1ab1bd115506441931902bbf063 Mon Sep 17 00:00:00 2001 From: carla Date: Thu, 15 Jan 2026 10:09:23 +0100 Subject: [PATCH 1/5] chore: add nimble_csv dependency --- mix.exs | 3 ++- mix.lock | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/mix.exs b/mix.exs index 4092b7a..99f4507 100644 --- a/mix.exs +++ b/mix.exs @@ -78,7 +78,8 @@ defmodule Mv.MixProject do {:credo, "~> 1.7", only: [:dev, :test], runtime: false}, {:picosat_elixir, "~> 0.1", only: [:dev, :test]}, {:ecto_commons, "~> 0.3"}, - {:slugify, "~> 1.3"} + {:slugify, "~> 1.3"}, + {:nimble_csv, "~> 1.0"} ] end diff --git a/mix.lock b/mix.lock index 54b0e51..26aa555 100644 --- a/mix.lock +++ b/mix.lock @@ -47,6 +47,7 @@ "mime": {:hex, :mime, "2.0.7", "b8d739037be7cd402aee1ba0306edfdef982687ee7e9859bee6198c1e7e2f128", [:mix], [], "hexpm", "6171188e399ee16023ffc5b76ce445eb6d9672e2e241d2df6050f3c771e80ccd"}, "mint": {:hex, :mint, "1.7.1", "113fdb2b2f3b59e47c7955971854641c61f378549d73e829e1768de90fc1abf1", [:mix], [{:castore, "~> 0.1.0 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:hpax, "~> 0.1.1 or ~> 0.2.0 or ~> 1.0", [hex: :hpax, repo: "hexpm", optional: false]}], "hexpm", "fceba0a4d0f24301ddee3024ae116df1c3f4bb7a563a731f45fdfeb9d39a231b"}, "mix_audit": {:hex, :mix_audit, "2.1.5", "c0f77cee6b4ef9d97e37772359a187a166c7a1e0e08b50edf5bf6959dfe5a016", [:make, :mix], [{:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: false]}, {:yaml_elixir, "~> 2.11", [hex: :yaml_elixir, repo: "hexpm", optional: false]}], "hexpm", "87f9298e21da32f697af535475860dc1d3617a010e0b418d2ec6142bc8b42d69"}, + "nimble_csv": {:hex, :nimble_csv, "1.3.0", "b7f998dc62b222bce9596e46f028c7a5af04cb5dde6df2ea197c583227c54971", [:mix], [], "hexpm", "41ccdc18f7c8f8bb06e84164fc51635321e80d5a3b450761c4997d620925d619"}, "nimble_options": {:hex, :nimble_options, "1.1.1", "e3a492d54d85fc3fd7c5baf411d9d2852922f66e69476317787a7b2bb000a61b", [:mix], [], "hexpm", 
"821b2470ca9442c4b6984882fe9bb0389371b8ddec4d45a9504f00a66f650b44"}, "nimble_pool": {:hex, :nimble_pool, "1.1.0", "bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"}, "owl": {:hex, :owl, "0.13.0", "26010e066d5992774268f3163506972ddac0a7e77bfe57fa42a250f24d6b876e", [:mix], [{:ucwidth, "~> 0.2", [hex: :ucwidth, repo: "hexpm", optional: true]}], "hexpm", "59bf9d11ce37a4db98f57cb68fbfd61593bf419ec4ed302852b6683d3d2f7475"}, From 68e19bea1836011e3f6cf547261bfaa1fa3f95ee Mon Sep 17 00:00:00 2001 From: carla Date: Thu, 15 Jan 2026 10:10:02 +0100 Subject: [PATCH 2/5] feat: add csv parser --- lib/mv/membership/import/csv_parser.ex | 276 +++++++++++++++++++++++++ 1 file changed, 276 insertions(+) create mode 100644 lib/mv/membership/import/csv_parser.ex diff --git a/lib/mv/membership/import/csv_parser.ex b/lib/mv/membership/import/csv_parser.ex new file mode 100644 index 0000000..b9f036d --- /dev/null +++ b/lib/mv/membership/import/csv_parser.ex @@ -0,0 +1,276 @@ +NimbleCSV.define(Mv.Membership.Import.CsvParserSemicolon, separator: ";", escape: "\"") +NimbleCSV.define(Mv.Membership.Import.CsvParserComma, separator: ",", escape: "\"") + +defmodule Mv.Membership.Import.CsvParser do + @moduledoc """ + CSV parser with BOM handling, delimiter auto-detection, and physical line numbering. + + Guarantees: + - UTF-8 BOM is stripped (Excel) + - Delimiter auto-detected (semicolon/comma) using NimbleCSV parsing (quote-aware) + - Returns rows tagged with their *physical start line number* in the CSV file (1-based) + - Skips completely empty rows (but preserves numbering by using physical line numbers) + - Handles `\\r\\n`, `\\n`, `\\r` + - Correct even when fields contain newlines inside quotes: the row gets the start line number + """ + + @utf8_bom <<0xEF, 0xBB, 0xBF>> + @quote ?" 
+ + @type line_number :: pos_integer() + @type row :: [String.t()] + @type numbered_row :: {line_number(), row()} + + @spec parse(binary()) :: {:ok, row(), [numbered_row()]} | {:error, String.t()} + def parse(file_content) when is_binary(file_content) do + content = + file_content + |> strip_bom() + |> normalize_line_endings() + + with :ok <- validate_content_not_empty(content), + :ok <- check_balanced_quotes(content), + {:ok, header_record, data_records} <- extract_header_and_data(content), + :ok <- validate_header_not_empty(header_record), + {:ok, headers, rows} <- parse_csv_records(header_record, data_records) do + {:ok, headers, rows} + else + {:error, reason} -> {:error, reason} + end + end + + def parse(_), do: {:error, "Invalid CSV content"} + + @spec validate_content_not_empty(binary()) :: :ok | {:error, String.t()} + defp validate_content_not_empty(content) do + if String.trim(content) == "" do + {:error, "CSV file is empty"} + else + :ok + end + end + + @spec extract_header_and_data(binary()) :: {:ok, binary(), [{line_number(), binary()}]} | {:error, String.t()} + defp extract_header_and_data(content) do + records = split_records_with_line_numbers(content) + + case records do + [] -> + {:error, "CSV file is empty"} + + [{_line1, header_record} | data_records] -> + {:ok, header_record, data_records} + end + end + + @spec validate_header_not_empty(binary()) :: :ok | {:error, String.t()} + defp validate_header_not_empty(header_record) do + if String.trim(header_record) == "" do + {:error, "CSV file has no header row"} + else + :ok + end + end + + @spec parse_csv_records(binary(), [{line_number(), binary()}]) :: + {:ok, row(), [numbered_row()]} | {:error, String.t()} + defp parse_csv_records(header_record, data_records) do + delimiter = detect_delimiter_by_parsing(header_record) + parser = get_parser(delimiter) + + with {:ok, headers} <- + parse_single_record(parser, header_record, "CSV file has no header row"), + {:ok, rows} <- parse_data_records(parser, 
data_records) do + {:ok, headers, rows} + end + end + + @spec strip_bom(binary()) :: binary() + defp strip_bom(<<@utf8_bom, rest::binary>>), do: rest + defp strip_bom(content), do: content + + @spec normalize_line_endings(binary()) :: binary() + defp normalize_line_endings(content) do + content + |> String.replace("\r\n", "\n") + |> String.replace("\r", "\n") + end + + @spec get_parser(String.t()) :: module() + defp get_parser(";"), do: Mv.Membership.Import.CsvParserSemicolon + defp get_parser(","), do: Mv.Membership.Import.CsvParserComma + defp get_parser(_), do: Mv.Membership.Import.CsvParserSemicolon + + # --- Delimiter detection (quote-aware by actually parsing the header) --- + + @spec detect_delimiter_by_parsing(binary()) :: String.t() + defp detect_delimiter_by_parsing(header_record) do + semicolon_score = header_field_count(Mv.Membership.Import.CsvParserSemicolon, header_record) + comma_score = header_field_count(Mv.Membership.Import.CsvParserComma, header_record) + + # prefer ";" on tie + if semicolon_score >= comma_score, do: ";", else: "," + end + + @spec header_field_count(module(), binary()) :: non_neg_integer() + defp header_field_count(parser, header_record) do + case parse_single_record(parser, header_record, nil) do + {:ok, fields} -> Enum.count(fields, &(String.trim(&1) != "")) + {:error, _} -> 0 + end + end + + # Parses exactly one record (string without trailing newline is fine). + # Returns {:ok, row} or {:error, reason}. + @spec parse_single_record(module(), binary(), String.t() | nil) :: + {:ok, row()} | {:error, String.t()} + defp parse_single_record(parser, record, error_reason_if_empty) do + try do + # NimbleCSV is happiest if there's a newline at the end. 
+ rows = parser.parse_string(ensure_trailing_newline(record), skip_headers: false) + + case rows do + [row] when is_list(row) and row != [] -> + {:ok, row} + + _ -> + if is_binary(error_reason_if_empty), + do: {:error, error_reason_if_empty}, + else: {:error, "Failed to parse CSV header"} + end + rescue + e -> + {:error, "Failed to parse CSV: #{Exception.message(e)}"} + end + end + + @spec ensure_trailing_newline(binary()) :: binary() + defp ensure_trailing_newline(str) do + if String.ends_with?(str, "\n"), do: str, else: str <> "\n" + end + + # --- Data parsing preserving *physical* line numbers --- + + @spec parse_data_records(module(), [{line_number(), binary()}]) :: + {:ok, [numbered_row()]} | {:error, String.t()} + defp parse_data_records(parser, data_records) do + try do + rows = + data_records + |> Enum.reduce([], fn {line_no, record}, acc -> + case String.trim(record) do + "" -> + acc + + _ -> + parsed = parser.parse_string(ensure_trailing_newline(record), skip_headers: false) + + case parsed do + [row] when is_list(row) -> + if empty_row?(row) do + acc + else + [{line_no, row} | acc] + end + + # empty row or unparsable row -> treat as skipped empty row + _ -> + acc + end + end + end) + |> Enum.reverse() + + {:ok, rows} + rescue + e -> + {:error, "Failed to parse CSV: #{Exception.message(e)}"} + end + end + + @spec empty_row?(row()) :: boolean() + defp empty_row?(row) when is_list(row) do + Enum.all?(row, fn field -> String.trim(field) == "" end) + end + + # Check if quotes are balanced in the content. + # + # This is a simple check that counts quote characters. In CSV, escaped quotes + # are represented as "", so an odd number of quotes indicates unbalanced quotes. 
+ @spec check_balanced_quotes(binary()) :: :ok | {:error, String.t()} + defp check_balanced_quotes(content) do + quote_count = content |> :binary.bin_to_list() |> Enum.count(&(&1 == @quote)) + + if rem(quote_count, 2) != 0 do + {:error, "Unbalanced quotes in CSV file"} + else + :ok + end + end + + # --- Record splitting with correct line numbers (quote-aware) --- + # + # This splits the CSV into "records" separated by newline *outside quotes*, + # returning [{start_line_number, record_string_without_newline}, ...] + # + # Line numbers are 1-based and represent the physical line in the CSV file. + # Empty lines are included in the numbering (they're just skipped later). + # + @spec split_records_with_line_numbers(binary()) :: [{line_number(), binary()}] + defp split_records_with_line_numbers(content) do + {acc, buf, _in_quotes, _line, start_line} = + do_split(content, [], [], false, 1, 1) + + # finalize last record only if there is buffered content + acc = + case buf do + [] -> + acc + + _ -> + record = buf |> Enum.reverse() |> :erlang.list_to_binary() + [{start_line, record} | acc] + end + + Enum.reverse(acc) + end + + # EOF + @spec do_split( + binary(), + [{line_number(), binary()}], + [byte()], + boolean(), + line_number(), + line_number() + ) :: + {[{line_number(), binary()}], [byte()], boolean(), line_number(), line_number()} + defp do_split(<<>>, acc, buf, in_quotes, line, start_line), + do: {acc, buf, in_quotes, line, start_line} + + # Escaped quote inside quoted field: "" -> keep both quotes, do NOT toggle in_quotes + defp do_split(<<@quote, @quote, rest::binary>>, acc, buf, true = in_quotes, line, start_line) do + do_split(rest, acc, [@quote, @quote | buf], in_quotes, line, start_line) + end + + # Quote toggles quote state (when not escaped "") + defp do_split(<<@quote, rest::binary>>, acc, buf, in_quotes, line, start_line) do + do_split(rest, acc, [@quote | buf], not in_quotes, line, start_line) + end + + # Newline outside quotes ends a record (even if 
empty) + defp do_split(<<"\n", rest::binary>>, acc, buf, false, line, start_line) do + record = buf |> Enum.reverse() |> :erlang.list_to_binary() + do_split(rest, [{start_line, record} | acc], [], false, line + 1, line + 1) + end + + # Newline inside quotes is part of the record, but advances physical line counter + defp do_split(<<"\n", rest::binary>>, acc, buf, true = in_quotes, line, start_line) do + do_split(rest, acc, [?\n | buf], in_quotes, line + 1, start_line) + end + + # Any other byte + defp do_split(<<ch, rest::binary>>, acc, buf, in_quotes, line, start_line) do + do_split(rest, acc, [ch | buf], in_quotes, line, start_line) + end +end From 31cf07c071f36f72b875a1eabbd985ed804985b2 Mon Sep 17 00:00:00 2001 From: carla Date: Thu, 15 Jan 2026 10:10:14 +0100 Subject: [PATCH 3/5] test: updated tests --- test/mv/membership/import/csv_parser_test.exs | 191 ++++++++++++++++++ test/mv/membership/import/member_csv_test.exs | 1 - 2 files changed, 191 insertions(+), 1 deletion(-) create mode 100644 test/mv/membership/import/csv_parser_test.exs diff --git a/test/mv/membership/import/csv_parser_test.exs b/test/mv/membership/import/csv_parser_test.exs new file mode 100644 index 0000000..539e57e --- /dev/null +++ b/test/mv/membership/import/csv_parser_test.exs @@ -0,0 +1,191 @@ +defmodule Mv.Membership.Import.CsvParserTest do + use ExUnit.Case, async: true + + alias Mv.Membership.Import.CsvParser + + describe "parse/1" do + test "returns {:ok, headers, rows} for valid CSV with semicolon delimiter" do + csv_content = "email;first_name\njohn@example.com;John" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email", "first_name"] + assert rows == [{2, ["john@example.com", "John"]}] + end + + test "returns {:ok, headers, rows} for valid CSV with comma delimiter" do + csv_content = "email,first_name\njohn@example.com,John" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email", "first_name"] + assert rows == [{2,
["john@example.com", "John"]}] + end + + test "detects semicolon delimiter when both delimiters present" do + csv_content = "email;first_name,last_name\njohn@example.com;John,Doe" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + # Should detect semicolon as primary delimiter + assert length(headers) >= 2 + assert length(rows) == 1 + end + + test "prefers semicolon delimiter when recognition is tied" do + # CSV where both delimiters would yield same number of fields + csv_content = "email;name\njohn@example.com;John" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + # Should prefer semicolon + assert headers == ["email", "name"] + assert rows == [{2, ["john@example.com", "John"]}] + end + + test "defaults to semicolon delimiter when no headers recognized" do + csv_content = "unknown1;unknown2\nvalue1;value2" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + # Should default to semicolon + assert headers == ["unknown1", "unknown2"] + assert rows == [{2, ["value1", "value2"]}] + end + end + + describe "BOM handling" do + test "strips UTF-8 BOM from file content" do + bom = <<0xEF, 0xBB, 0xBF>> + csv_content = bom <> "email;first_name\njohn@example.com;John" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email", "first_name"] + assert rows == [{2, ["john@example.com", "John"]}] + end + + test "parses CSV with BOM correctly (Excel export compatibility)" do + bom = <<0xEF, 0xBB, 0xBF>> + + csv_content = + bom <> + "email;first_name;last_name\njohn@example.com;John;Doe\njane@example.com;Jane;Smith" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email", "first_name", "last_name"] + assert length(rows) == 2 + assert Enum.at(rows, 0) == {2, ["john@example.com", "John", "Doe"]} + assert Enum.at(rows, 1) == {3, ["jane@example.com", "Jane", "Smith"]} + end + end + + describe "line number handling" do + test "header row is line 1, first data row is 
line 2" do + csv_content = "email\njohn@example.com" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email"] + assert rows == [{2, ["john@example.com"]}] + end + + test "preserves correct line numbers when empty lines are skipped" do + csv_content = "email;first_name\n\njohn@example.com;John\n\njane@example.com;Jane" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email", "first_name"] + # Line 2 is empty (skipped), line 3 has data + assert Enum.at(rows, 0) == {3, ["john@example.com", "John"]} + # Line 4 is empty (skipped), line 5 has data + assert Enum.at(rows, 1) == {5, ["jane@example.com", "Jane"]} + end + + test "skips completely empty rows but preserves line numbers" do + csv_content = "email\n\n\njohn@example.com" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email"] + # Lines 2, 3, 4 are empty (skipped), line 4 has data + assert rows == [{4, ["john@example.com"]}] + end + end + + describe "line ending handling" do + test "handles \\r\\n line endings correctly" do + csv_content = "email;first_name\r\njohn@example.com;John\r\njane@example.com;Jane" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email", "first_name"] + assert length(rows) == 2 + assert Enum.at(rows, 0) == {2, ["john@example.com", "John"]} + assert Enum.at(rows, 1) == {3, ["jane@example.com", "Jane"]} + end + + test "handles \\n line endings correctly" do + csv_content = "email;first_name\njohn@example.com;John\njane@example.com;Jane" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email", "first_name"] + assert length(rows) == 2 + assert Enum.at(rows, 0) == {2, ["john@example.com", "John"]} + assert Enum.at(rows, 1) == {3, ["jane@example.com", "Jane"]} + end + end + + describe "quoted fields" do + test "parses quoted fields correctly" do + csv_content = "email;name\njohn@example.com;\"John Doe\"" + + 
assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email", "name"] + assert rows == [{2, ["john@example.com", "John Doe"]}] + end + + test "handles escaped quotes (\"\") inside quoted fields" do + csv_content = "email;name\njohn@example.com;\"John \"\"Johnny\"\" Doe\"" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email", "name"] + assert rows == [{2, ["john@example.com", "John \"Johnny\" Doe"]}] + end + end + + describe "error handling" do + test "returns {:error, reason} for empty file" do + assert {:error, reason} = CsvParser.parse("") + assert reason =~ "empty" + end + + test "returns {:error, reason} when no header row found" do + # Only whitespace after BOM strip + assert {:error, reason} = CsvParser.parse(" \n ") + assert reason =~ "CSV file is empty" + end + + test "returns {:error, reason} for invalid CSV format" do + # Unbalanced quotes + csv_content = "email;name\n\"john@example.com;John" + + assert {:error, reason} = CsvParser.parse(csv_content) + assert is_binary(reason) + end + end + + describe "module documentation" do + test "module has @moduledoc" do + assert Code.ensure_loaded?(CsvParser) + + {:docs_v1, _, _, _, %{"en" => moduledoc}, _, _} = Code.fetch_docs(CsvParser) + assert is_binary(moduledoc) + assert String.length(moduledoc) > 0 + end + end +end diff --git a/test/mv/membership/import/member_csv_test.exs b/test/mv/membership/import/member_csv_test.exs index 5ac5311..e918afd 100644 --- a/test/mv/membership/import/member_csv_test.exs +++ b/test/mv/membership/import/member_csv_test.exs @@ -44,7 +44,6 @@ defmodule Mv.Membership.Import.MemberCSVTest do assert match?({:ok, _}, result) or match?({:error, _}, result) end - # Skip until Issue #3 is implemented @tag :skip test "returns {:ok, import_state} on success" do file_content = "email\njohn@example.com" From 3bbe9895eebb5d8a7e4ad7ef4b6942412e5b6eba Mon Sep 17 00:00:00 2001 From: carla Date: Thu, 15 Jan 2026 11:08:22 +0100 
Subject: [PATCH 4/5] fix: improve CSV parser error handling --- lib/mv/membership/import/csv_parser.ex | 53 ++++++++----------- test/mv/membership/import/csv_parser_test.exs | 32 +++++++++-- 2 files changed, 51 insertions(+), 34 deletions(-) diff --git a/lib/mv/membership/import/csv_parser.ex b/lib/mv/membership/import/csv_parser.ex index b9f036d..22a7496 100644 --- a/lib/mv/membership/import/csv_parser.ex +++ b/lib/mv/membership/import/csv_parser.ex @@ -23,13 +23,9 @@ defmodule Mv.Membership.Import.CsvParser do @spec parse(binary()) :: {:ok, row(), [numbered_row()]} | {:error, String.t()} def parse(file_content) when is_binary(file_content) do - content = - file_content - |> strip_bom() - |> normalize_line_endings() - - with :ok <- validate_content_not_empty(content), - :ok <- check_balanced_quotes(content), + with :ok <- validate_utf8(file_content), + content <- file_content |> strip_bom() |> normalize_line_endings(), + :ok <- validate_content_not_empty(content), {:ok, header_record, data_records} <- extract_header_and_data(content), :ok <- validate_header_not_empty(header_record), {:ok, headers, rows} <- parse_csv_records(header_record, data_records) do @@ -41,6 +37,15 @@ defmodule Mv.Membership.Import.CsvParser do def parse(_), do: {:error, "Invalid CSV content"} + @spec validate_utf8(binary()) :: :ok | {:error, String.t()} + defp validate_utf8(content) do + if String.valid?(content) do + :ok + else + {:error, "CSV must be valid UTF-8"} + end + end + @spec validate_content_not_empty(binary()) :: :ok | {:error, String.t()} defp validate_content_not_empty(content) do if String.trim(content) == "" do @@ -157,10 +162,10 @@ defmodule Mv.Membership.Import.CsvParser do try do rows = data_records - |> Enum.reduce([], fn {line_no, record}, acc -> + |> Enum.reduce_while([], fn {line_no, record}, acc -> case String.trim(record) do "" -> - acc + {:cont, acc} _ -> parsed = parser.parse_string(ensure_trailing_newline(record), skip_headers: false) @@ -168,20 +173,23 @@ 
defmodule Mv.Membership.Import.CsvParser do case parsed do [row] when is_list(row) -> if empty_row?(row) do - acc + {:cont, acc} else - [{line_no, row} | acc] + {:cont, [{line_no, row} | acc]} end - # empty row or unparsable row -> treat as skipped empty row + # unparsable row -> return error with line number _ -> - acc + snippet = String.slice(record, 0, min(50, String.length(record))) + {:halt, {:error, "Failed to parse CSV data at line #{line_no}: #{inspect(snippet)}"}} end end end) - |> Enum.reverse() - {:ok, rows} + case rows do + {:error, reason} -> {:error, reason} + rows -> {:ok, Enum.reverse(rows)} + end rescue e -> {:error, "Failed to parse CSV: #{Exception.message(e)}"} @@ -193,21 +201,6 @@ defmodule Mv.Membership.Import.CsvParser do Enum.all?(row, fn field -> String.trim(field) == "" end) end - # Check if quotes are balanced in the content. - # - # This is a simple check that counts quote characters. In CSV, escaped quotes - # are represented as "", so an odd number of quotes indicates unbalanced quotes. 
- @spec check_balanced_quotes(binary()) :: :ok | {:error, String.t()} - defp check_balanced_quotes(content) do - quote_count = content |> :binary.bin_to_list() |> Enum.count(&(&1 == @quote)) - - if rem(quote_count, 2) != 0 do - {:error, "Unbalanced quotes in CSV file"} - else - :ok - end - end - # --- Record splitting with correct line numbers (quote-aware) --- # # This splits the CSV into "records" separated by newline *outside quotes*, diff --git a/test/mv/membership/import/csv_parser_test.exs b/test/mv/membership/import/csv_parser_test.exs index 539e57e..e4a762b 100644 --- a/test/mv/membership/import/csv_parser_test.exs +++ b/test/mv/membership/import/csv_parser_test.exs @@ -109,7 +109,7 @@ defmodule Mv.Membership.Import.CsvParserTest do assert {:ok, headers, rows} = CsvParser.parse(csv_content) assert headers == ["email"] - # Lines 2, 3, 4 are empty (skipped), line 4 has data + # Lines 2 & 3 are empty (skipped), line 4 has data assert rows == [{4, ["john@example.com"]}] end end @@ -156,6 +156,19 @@ defmodule Mv.Membership.Import.CsvParserTest do assert headers == ["email", "name"] assert rows == [{2, ["john@example.com", "John \"Johnny\" Doe"]}] end + + test "handles multiline quoted fields with correct line numbering" do + # Header line 1 + # Data record starts line 2, contains "foo\nbar" in a field + # Record ends physically at line 3 + # Expected: row gets line number 2 (start line) + csv_content = "email;description\njohn@example.com;\"foo\nbar\"" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email", "description"] + assert rows == [{2, ["john@example.com", "foo\nbar"]}] + end end describe "error handling" do @@ -170,12 +183,23 @@ defmodule Mv.Membership.Import.CsvParserTest do assert reason =~ "CSV file is empty" end - test "returns {:error, reason} for invalid CSV format" do - # Unbalanced quotes - csv_content = "email;name\n\"john@example.com;John" + test "returns {:error, reason} for invalid UTF-8 content" do + # 
Invalid UTF-8 sequence + invalid_utf8 = <<0xFF, 0xFE, 0xFD>> + + assert {:error, reason} = CsvParser.parse(invalid_utf8) + assert reason =~ "UTF-8" + end + + test "returns {:error, reason} for unparsable data row" do + # Malformed CSV row that cannot be parsed + # NimbleCSV will throw an exception for unclosed quotes + csv_content = "email;name\njohn@example.com;\"unclosed quote" assert {:error, reason} = CsvParser.parse(csv_content) assert is_binary(reason) + # Error message should indicate parsing failure + assert reason =~ "parse" or reason =~ "CSV" end end From 8a5d012895903c458836291a3e85f83c6ccdfc98 Mon Sep 17 00:00:00 2001 From: carla Date: Thu, 15 Jan 2026 12:15:22 +0100 Subject: [PATCH 5/5] refactor parser --- lib/mv/membership/import/csv_parser.ex | 136 +++++++++++++++---------- 1 file changed, 81 insertions(+), 55 deletions(-) diff --git a/lib/mv/membership/import/csv_parser.ex b/lib/mv/membership/import/csv_parser.ex index 22a7496..2de75ee 100644 --- a/lib/mv/membership/import/csv_parser.ex +++ b/lib/mv/membership/import/csv_parser.ex @@ -16,6 +16,7 @@ defmodule Mv.Membership.Import.CsvParser do @utf8_bom <<0xEF, 0xBB, 0xBF>> @quote ?" 
+ @max_error_snippet_length 50 @type line_number :: pos_integer() @type row :: [String.t()] @@ -27,11 +28,8 @@ defmodule Mv.Membership.Import.CsvParser do content <- file_content |> strip_bom() |> normalize_line_endings(), :ok <- validate_content_not_empty(content), {:ok, header_record, data_records} <- extract_header_and_data(content), - :ok <- validate_header_not_empty(header_record), - {:ok, headers, rows} <- parse_csv_records(header_record, data_records) do - {:ok, headers, rows} - else - {:error, reason} -> {:error, reason} + :ok <- validate_header_not_empty(header_record) do + parse_csv_records(header_record, data_records) end end @@ -55,7 +53,8 @@ defmodule Mv.Membership.Import.CsvParser do end end - @spec extract_header_and_data(binary()) :: {:ok, binary(), [{line_number(), binary()}]} | {:error, String.t()} + @spec extract_header_and_data(binary()) :: + {:ok, binary(), [{line_number(), binary()}]} | {:error, String.t()} defp extract_header_and_data(content) do records = split_records_with_line_numbers(content) @@ -126,27 +125,25 @@ defmodule Mv.Membership.Import.CsvParser do end # Parses exactly one record (string without trailing newline is fine). - # Returns {:ok, row} or {:error, reason}. + # Returns `{:ok, row}` or `{:error, reason}`. @spec parse_single_record(module(), binary(), String.t() | nil) :: {:ok, row()} | {:error, String.t()} defp parse_single_record(parser, record, error_reason_if_empty) do - try do - # NimbleCSV is happiest if there's a newline at the end. - rows = parser.parse_string(ensure_trailing_newline(record), skip_headers: false) + # NimbleCSV is happiest if there's a newline at the end. 
+ rows = parser.parse_string(ensure_trailing_newline(record), skip_headers: false) - case rows do - [row] when is_list(row) and row != [] -> - {:ok, row} + case rows do + [row] when is_list(row) and row != [] -> + {:ok, row} - _ -> - if is_binary(error_reason_if_empty), - do: {:error, error_reason_if_empty}, - else: {:error, "Failed to parse CSV header"} - end - rescue - e -> - {:error, "Failed to parse CSV: #{Exception.message(e)}"} + _ -> + if is_binary(error_reason_if_empty), + do: {:error, error_reason_if_empty}, + else: {:error, "Failed to parse CSV header"} end + rescue + e -> + {:error, "Failed to parse CSV: #{Exception.message(e)}"} end @spec ensure_trailing_newline(binary()) :: binary() @@ -155,44 +152,59 @@ defmodule Mv.Membership.Import.CsvParser do end # --- Data parsing preserving *physical* line numbers --- - + # + # Parses data records while preserving physical line numbers. + # Skips empty rows but maintains correct line numbering for error reporting. + # @spec parse_data_records(module(), [{line_number(), binary()}]) :: {:ok, [numbered_row()]} | {:error, String.t()} defp parse_data_records(parser, data_records) do - try do - rows = - data_records - |> Enum.reduce_while([], fn {line_no, record}, acc -> - case String.trim(record) do - "" -> - {:cont, acc} + rows = + data_records + |> Enum.reduce_while([], fn {line_no, record}, acc -> + process_data_record(parser, line_no, record, acc) + end) - _ -> - parsed = parser.parse_string(ensure_trailing_newline(record), skip_headers: false) + case rows do + {:error, reason} -> {:error, reason} + rows -> {:ok, Enum.reverse(rows)} + end + rescue + e -> + {:error, "Failed to parse CSV: #{Exception.message(e)}"} + end - case parsed do - [row] when is_list(row) -> - if empty_row?(row) do - {:cont, acc} - else - {:cont, [{line_no, row} | acc]} - end + @spec process_data_record(module(), line_number(), binary(), [numbered_row()]) :: + {:cont, [numbered_row()]} | {:halt, {:error, String.t()}} + defp 
process_data_record(parser, line_no, record, acc) do + trimmed = String.trim(record) - # unparsable row -> return error with line number - _ -> - snippet = String.slice(record, 0, min(50, String.length(record))) - {:halt, {:error, "Failed to parse CSV data at line #{line_no}: #{inspect(snippet)}"}} - end - end - end) + if trimmed == "" do + {:cont, acc} + else + process_non_empty_record(parser, line_no, record, acc) + end + end - case rows do - {:error, reason} -> {:error, reason} - rows -> {:ok, Enum.reverse(rows)} - end - rescue - e -> - {:error, "Failed to parse CSV: #{Exception.message(e)}"} + @spec process_non_empty_record(module(), line_number(), binary(), [numbered_row()]) :: + {:cont, [numbered_row()]} | {:halt, {:error, String.t()}} + defp process_non_empty_record(parser, line_no, record, acc) do + parsed = parser.parse_string(ensure_trailing_newline(record), skip_headers: false) + + case parsed do + [row] when is_list(row) -> + if empty_row?(row) do + {:cont, acc} + else + {:cont, [{line_no, row} | acc]} + end + + # unparsable row -> return error with line number + _ -> + snippet = + String.slice(record, 0, min(@max_error_snippet_length, String.length(record))) + + {:halt, {:error, "Failed to parse CSV data at line #{line_no}: #{inspect(snippet)}"}} end end @@ -203,8 +215,8 @@ defmodule Mv.Membership.Import.CsvParser do # --- Record splitting with correct line numbers (quote-aware) --- # - # This splits the CSV into "records" separated by newline *outside quotes*, - # returning [{start_line_number, record_string_without_newline}, ...] + # Splits the CSV into records separated by newline *outside quotes*. + # Returns `[{start_line_number, record_string_without_newline}, ...]`. # # Line numbers are 1-based and represent the physical line in the CSV file. # Empty lines are included in the numbering (they're just skipped later). 
@@ -228,7 +240,21 @@ defmodule Mv.Membership.Import.CsvParser do Enum.reverse(acc) end - # EOF + # Recursively splits CSV content into records with correct line numbering. + # + # Handles quote-aware parsing: + # - Escaped quotes (`""`) inside quoted fields are preserved + # - Newlines inside quotes are part of the record but advance line counter + # - Newlines outside quotes end a record + # + # Parameters: + # - `content` - Remaining binary content to parse + # - `acc` - Accumulated records `[{line_number, record}, ...]` + # - `buf` - Current record buffer (reversed byte list) + # - `in_quotes` - Whether we're currently inside a quoted field + # - `line` - Current physical line number + # - `start_line` - Line number where current record started + # @spec do_split( binary(), [{line_number(), binary()}],