From 699d4385cba3d1ab1bd115506441931902bbf063 Mon Sep 17 00:00:00 2001 From: carla Date: Thu, 15 Jan 2026 10:09:23 +0100 Subject: [PATCH 1/5] chore: add nimble_csv dependency --- mix.exs | 3 ++- mix.lock | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/mix.exs b/mix.exs index 4092b7a..99f4507 100644 --- a/mix.exs +++ b/mix.exs @@ -78,7 +78,8 @@ defmodule Mv.MixProject do {:credo, "~> 1.7", only: [:dev, :test], runtime: false}, {:picosat_elixir, "~> 0.1", only: [:dev, :test]}, {:ecto_commons, "~> 0.3"}, - {:slugify, "~> 1.3"} + {:slugify, "~> 1.3"}, + {:nimble_csv, "~> 1.0"} ] end diff --git a/mix.lock b/mix.lock index 54b0e51..26aa555 100644 --- a/mix.lock +++ b/mix.lock @@ -47,6 +47,7 @@ "mime": {:hex, :mime, "2.0.7", "b8d739037be7cd402aee1ba0306edfdef982687ee7e9859bee6198c1e7e2f128", [:mix], [], "hexpm", "6171188e399ee16023ffc5b76ce445eb6d9672e2e241d2df6050f3c771e80ccd"}, "mint": {:hex, :mint, "1.7.1", "113fdb2b2f3b59e47c7955971854641c61f378549d73e829e1768de90fc1abf1", [:mix], [{:castore, "~> 0.1.0 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:hpax, "~> 0.1.1 or ~> 0.2.0 or ~> 1.0", [hex: :hpax, repo: "hexpm", optional: false]}], "hexpm", "fceba0a4d0f24301ddee3024ae116df1c3f4bb7a563a731f45fdfeb9d39a231b"}, "mix_audit": {:hex, :mix_audit, "2.1.5", "c0f77cee6b4ef9d97e37772359a187a166c7a1e0e08b50edf5bf6959dfe5a016", [:make, :mix], [{:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: false]}, {:yaml_elixir, "~> 2.11", [hex: :yaml_elixir, repo: "hexpm", optional: false]}], "hexpm", "87f9298e21da32f697af535475860dc1d3617a010e0b418d2ec6142bc8b42d69"}, + "nimble_csv": {:hex, :nimble_csv, "1.3.0", "b7f998dc62b222bce9596e46f028c7a5af04cb5dde6df2ea197c583227c54971", [:mix], [], "hexpm", "41ccdc18f7c8f8bb06e84164fc51635321e80d5a3b450761c4997d620925d619"}, "nimble_options": {:hex, :nimble_options, "1.1.1", "e3a492d54d85fc3fd7c5baf411d9d2852922f66e69476317787a7b2bb000a61b", [:mix], [], "hexpm", 
"821b2470ca9442c4b6984882fe9bb0389371b8ddec4d45a9504f00a66f650b44"}, "nimble_pool": {:hex, :nimble_pool, "1.1.0", "bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"}, "owl": {:hex, :owl, "0.13.0", "26010e066d5992774268f3163506972ddac0a7e77bfe57fa42a250f24d6b876e", [:mix], [{:ucwidth, "~> 0.2", [hex: :ucwidth, repo: "hexpm", optional: true]}], "hexpm", "59bf9d11ce37a4db98f57cb68fbfd61593bf419ec4ed302852b6683d3d2f7475"}, From 68e19bea1836011e3f6cf547261bfaa1fa3f95ee Mon Sep 17 00:00:00 2001 From: carla Date: Thu, 15 Jan 2026 10:10:02 +0100 Subject: [PATCH 2/5] feat: add csv parser --- lib/mv/membership/import/csv_parser.ex | 276 +++++++++++++++++++++++++ 1 file changed, 276 insertions(+) create mode 100644 lib/mv/membership/import/csv_parser.ex diff --git a/lib/mv/membership/import/csv_parser.ex b/lib/mv/membership/import/csv_parser.ex new file mode 100644 index 0000000..b9f036d --- /dev/null +++ b/lib/mv/membership/import/csv_parser.ex @@ -0,0 +1,276 @@ +NimbleCSV.define(Mv.Membership.Import.CsvParserSemicolon, separator: ";", escape: "\"") +NimbleCSV.define(Mv.Membership.Import.CsvParserComma, separator: ",", escape: "\"") + +defmodule Mv.Membership.Import.CsvParser do + @moduledoc """ + CSV parser with BOM handling, delimiter auto-detection, and physical line numbering. + + Guarantees: + - UTF-8 BOM is stripped (Excel) + - Delimiter auto-detected (semicolon/comma) using NimbleCSV parsing (quote-aware) + - Returns rows tagged with their *physical start line number* in the CSV file (1-based) + - Skips completely empty rows (but preserves numbering by using physical line numbers) + - Handles `\\r\\n`, `\\n`, `\\r` + - Correct even when fields contain newlines inside quotes: the row gets the start line number + """ + + @utf8_bom <<0xEF, 0xBB, 0xBF>> + @quote ?" 
+ + @type line_number :: pos_integer() + @type row :: [String.t()] + @type numbered_row :: {line_number(), row()} + + @spec parse(binary()) :: {:ok, row(), [numbered_row()]} | {:error, String.t()} + def parse(file_content) when is_binary(file_content) do + content = + file_content + |> strip_bom() + |> normalize_line_endings() + + with :ok <- validate_content_not_empty(content), + :ok <- check_balanced_quotes(content), + {:ok, header_record, data_records} <- extract_header_and_data(content), + :ok <- validate_header_not_empty(header_record), + {:ok, headers, rows} <- parse_csv_records(header_record, data_records) do + {:ok, headers, rows} + else + {:error, reason} -> {:error, reason} + end + end + + def parse(_), do: {:error, "Invalid CSV content"} + + @spec validate_content_not_empty(binary()) :: :ok | {:error, String.t()} + defp validate_content_not_empty(content) do + if String.trim(content) == "" do + {:error, "CSV file is empty"} + else + :ok + end + end + + @spec extract_header_and_data(binary()) :: {:ok, binary(), [{line_number(), binary()}]} | {:error, String.t()} + defp extract_header_and_data(content) do + records = split_records_with_line_numbers(content) + + case records do + [] -> + {:error, "CSV file is empty"} + + [{_line1, header_record} | data_records] -> + {:ok, header_record, data_records} + end + end + + @spec validate_header_not_empty(binary()) :: :ok | {:error, String.t()} + defp validate_header_not_empty(header_record) do + if String.trim(header_record) == "" do + {:error, "CSV file has no header row"} + else + :ok + end + end + + @spec parse_csv_records(binary(), [{line_number(), binary()}]) :: + {:ok, row(), [numbered_row()]} | {:error, String.t()} + defp parse_csv_records(header_record, data_records) do + delimiter = detect_delimiter_by_parsing(header_record) + parser = get_parser(delimiter) + + with {:ok, headers} <- + parse_single_record(parser, header_record, "CSV file has no header row"), + {:ok, rows} <- parse_data_records(parser, 
data_records) do + {:ok, headers, rows} + end + end + + @spec strip_bom(binary()) :: binary() + defp strip_bom(<<@utf8_bom, rest::binary>>), do: rest + defp strip_bom(content), do: content + + @spec normalize_line_endings(binary()) :: binary() + defp normalize_line_endings(content) do + content + |> String.replace("\r\n", "\n") + |> String.replace("\r", "\n") + end + + @spec get_parser(String.t()) :: module() + defp get_parser(";"), do: Mv.Membership.Import.CsvParserSemicolon + defp get_parser(","), do: Mv.Membership.Import.CsvParserComma + defp get_parser(_), do: Mv.Membership.Import.CsvParserSemicolon + + # --- Delimiter detection (quote-aware by actually parsing the header) --- + + @spec detect_delimiter_by_parsing(binary()) :: String.t() + defp detect_delimiter_by_parsing(header_record) do + semicolon_score = header_field_count(Mv.Membership.Import.CsvParserSemicolon, header_record) + comma_score = header_field_count(Mv.Membership.Import.CsvParserComma, header_record) + + # prefer ";" on tie + if semicolon_score >= comma_score, do: ";", else: "," + end + + @spec header_field_count(module(), binary()) :: non_neg_integer() + defp header_field_count(parser, header_record) do + case parse_single_record(parser, header_record, nil) do + {:ok, fields} -> Enum.count(fields, &(String.trim(&1) != "")) + {:error, _} -> 0 + end + end + + # Parses exactly one record (string without trailing newline is fine). + # Returns {:ok, row} or {:error, reason}. + @spec parse_single_record(module(), binary(), String.t() | nil) :: + {:ok, row()} | {:error, String.t()} + defp parse_single_record(parser, record, error_reason_if_empty) do + try do + # NimbleCSV is happiest if there's a newline at the end. 
+ rows = parser.parse_string(ensure_trailing_newline(record), skip_headers: false) + + case rows do + [row] when is_list(row) and row != [] -> + {:ok, row} + + _ -> + if is_binary(error_reason_if_empty), + do: {:error, error_reason_if_empty}, + else: {:error, "Failed to parse CSV header"} + end + rescue + e -> + {:error, "Failed to parse CSV: #{Exception.message(e)}"} + end + end + + @spec ensure_trailing_newline(binary()) :: binary() + defp ensure_trailing_newline(str) do + if String.ends_with?(str, "\n"), do: str, else: str <> "\n" + end + + # --- Data parsing preserving *physical* line numbers --- + + @spec parse_data_records(module(), [{line_number(), binary()}]) :: + {:ok, [numbered_row()]} | {:error, String.t()} + defp parse_data_records(parser, data_records) do + try do + rows = + data_records + |> Enum.reduce([], fn {line_no, record}, acc -> + case String.trim(record) do + "" -> + acc + + _ -> + parsed = parser.parse_string(ensure_trailing_newline(record), skip_headers: false) + + case parsed do + [row] when is_list(row) -> + if empty_row?(row) do + acc + else + [{line_no, row} | acc] + end + + # empty row or unparsable row -> treat as skipped empty row + _ -> + acc + end + end + end) + |> Enum.reverse() + + {:ok, rows} + rescue + e -> + {:error, "Failed to parse CSV: #{Exception.message(e)}"} + end + end + + @spec empty_row?(row()) :: boolean() + defp empty_row?(row) when is_list(row) do + Enum.all?(row, fn field -> String.trim(field) == "" end) + end + + # Check if quotes are balanced in the content. + # + # This is a simple check that counts quote characters. In CSV, escaped quotes + # are represented as "", so an odd number of quotes indicates unbalanced quotes. 
+ @spec check_balanced_quotes(binary()) :: :ok | {:error, String.t()} + defp check_balanced_quotes(content) do + quote_count = content |> :binary.bin_to_list() |> Enum.count(&(&1 == @quote)) + + if rem(quote_count, 2) != 0 do + {:error, "Unbalanced quotes in CSV file"} + else + :ok + end + end + + # --- Record splitting with correct line numbers (quote-aware) --- + # + # This splits the CSV into "records" separated by newline *outside quotes*, + # returning [{start_line_number, record_string_without_newline}, ...] + # + # Line numbers are 1-based and represent the physical line in the CSV file. + # Empty lines are included in the numbering (they're just skipped later). + # + @spec split_records_with_line_numbers(binary()) :: [{line_number(), binary()}] + defp split_records_with_line_numbers(content) do + {acc, buf, _in_quotes, _line, start_line} = + do_split(content, [], [], false, 1, 1) + + # finalize last record only if there is buffered content + acc = + case buf do + [] -> + acc + + _ -> + record = buf |> Enum.reverse() |> :erlang.list_to_binary() + [{start_line, record} | acc] + end + + Enum.reverse(acc) + end + + # EOF + @spec do_split( + binary(), + [{line_number(), binary()}], + [byte()], + boolean(), + line_number(), + line_number() + ) :: + {[{line_number(), binary()}], [byte()], boolean(), line_number(), line_number()} + defp do_split(<<>>, acc, buf, in_quotes, line, start_line), + do: {acc, buf, in_quotes, line, start_line} + + # Escaped quote inside quoted field: "" -> keep both quotes, do NOT toggle in_quotes + defp do_split(<<@quote, @quote, rest::binary>>, acc, buf, true = in_quotes, line, start_line) do + do_split(rest, acc, [@quote, @quote | buf], in_quotes, line, start_line) + end + + # Quote toggles quote state (when not escaped "") + defp do_split(<<@quote, rest::binary>>, acc, buf, in_quotes, line, start_line) do + do_split(rest, acc, [@quote | buf], not in_quotes, line, start_line) + end + + # Newline outside quotes ends a record (even if 
empty) + defp do_split(<<"\n", rest::binary>>, acc, buf, false, line, start_line) do + record = buf |> Enum.reverse() |> :erlang.list_to_binary() + do_split(rest, [{start_line, record} | acc], [], false, line + 1, line + 1) + end + + # Newline inside quotes is part of the record, but advances physical line counter + defp do_split(<<"\n", rest::binary>>, acc, buf, true = in_quotes, line, start_line) do + do_split(rest, acc, [?\n | buf], in_quotes, line + 1, start_line) + end + + # Any other byte + defp do_split(<<ch, rest::binary>>, acc, buf, in_quotes, line, start_line) do + do_split(rest, acc, [ch | buf], in_quotes, line, start_line) + end +end From 31cf07c071f36f72b875a1eabbd985ed804985b2 Mon Sep 17 00:00:00 2001 From: carla Date: Thu, 15 Jan 2026 10:10:14 +0100 Subject: [PATCH 3/5] test: updated tests --- test/mv/membership/import/csv_parser_test.exs | 191 ++++++++++++++++++ test/mv/membership/import/member_csv_test.exs | 1 - 2 files changed, 191 insertions(+), 1 deletion(-) create mode 100644 test/mv/membership/import/csv_parser_test.exs diff --git a/test/mv/membership/import/csv_parser_test.exs b/test/mv/membership/import/csv_parser_test.exs new file mode 100644 index 0000000..539e57e --- /dev/null +++ b/test/mv/membership/import/csv_parser_test.exs @@ -0,0 +1,191 @@ +defmodule Mv.Membership.Import.CsvParserTest do + use ExUnit.Case, async: true + + alias Mv.Membership.Import.CsvParser + + describe "parse/1" do + test "returns {:ok, headers, rows} for valid CSV with semicolon delimiter" do + csv_content = "email;first_name\njohn@example.com;John" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email", "first_name"] + assert rows == [{2, ["john@example.com", "John"]}] + end + + test "returns {:ok, headers, rows} for valid CSV with comma delimiter" do + csv_content = "email,first_name\njohn@example.com,John" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email", "first_name"] + assert rows == [{2,
["john@example.com", "John"]}] + end + + test "detects semicolon delimiter when both delimiters present" do + csv_content = "email;first_name,last_name\njohn@example.com;John,Doe" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + # Should detect semicolon as primary delimiter + assert length(headers) >= 2 + assert length(rows) == 1 + end + + test "prefers semicolon delimiter when recognition is tied" do + # CSV where both delimiters would yield same number of fields + csv_content = "email;name\njohn@example.com;John" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + # Should prefer semicolon + assert headers == ["email", "name"] + assert rows == [{2, ["john@example.com", "John"]}] + end + + test "defaults to semicolon delimiter when no headers recognized" do + csv_content = "unknown1;unknown2\nvalue1;value2" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + # Should default to semicolon + assert headers == ["unknown1", "unknown2"] + assert rows == [{2, ["value1", "value2"]}] + end + end + + describe "BOM handling" do + test "strips UTF-8 BOM from file content" do + bom = <<0xEF, 0xBB, 0xBF>> + csv_content = bom <> "email;first_name\njohn@example.com;John" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email", "first_name"] + assert rows == [{2, ["john@example.com", "John"]}] + end + + test "parses CSV with BOM correctly (Excel export compatibility)" do + bom = <<0xEF, 0xBB, 0xBF>> + + csv_content = + bom <> + "email;first_name;last_name\njohn@example.com;John;Doe\njane@example.com;Jane;Smith" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email", "first_name", "last_name"] + assert length(rows) == 2 + assert Enum.at(rows, 0) == {2, ["john@example.com", "John", "Doe"]} + assert Enum.at(rows, 1) == {3, ["jane@example.com", "Jane", "Smith"]} + end + end + + describe "line number handling" do + test "header row is line 1, first data row is 
line 2" do + csv_content = "email\njohn@example.com" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email"] + assert rows == [{2, ["john@example.com"]}] + end + + test "preserves correct line numbers when empty lines are skipped" do + csv_content = "email;first_name\n\njohn@example.com;John\n\njane@example.com;Jane" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email", "first_name"] + # Line 2 is empty (skipped), line 3 has data + assert Enum.at(rows, 0) == {3, ["john@example.com", "John"]} + # Line 4 is empty (skipped), line 5 has data + assert Enum.at(rows, 1) == {5, ["jane@example.com", "Jane"]} + end + + test "skips completely empty rows but preserves line numbers" do + csv_content = "email\n\n\njohn@example.com" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email"] + # Lines 2, 3, 4 are empty (skipped), line 4 has data + assert rows == [{4, ["john@example.com"]}] + end + end + + describe "line ending handling" do + test "handles \\r\\n line endings correctly" do + csv_content = "email;first_name\r\njohn@example.com;John\r\njane@example.com;Jane" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email", "first_name"] + assert length(rows) == 2 + assert Enum.at(rows, 0) == {2, ["john@example.com", "John"]} + assert Enum.at(rows, 1) == {3, ["jane@example.com", "Jane"]} + end + + test "handles \\n line endings correctly" do + csv_content = "email;first_name\njohn@example.com;John\njane@example.com;Jane" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email", "first_name"] + assert length(rows) == 2 + assert Enum.at(rows, 0) == {2, ["john@example.com", "John"]} + assert Enum.at(rows, 1) == {3, ["jane@example.com", "Jane"]} + end + end + + describe "quoted fields" do + test "parses quoted fields correctly" do + csv_content = "email;name\njohn@example.com;\"John Doe\"" + + 
assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email", "name"] + assert rows == [{2, ["john@example.com", "John Doe"]}] + end + + test "handles escaped quotes (\"\") inside quoted fields" do + csv_content = "email;name\njohn@example.com;\"John \"\"Johnny\"\" Doe\"" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email", "name"] + assert rows == [{2, ["john@example.com", "John \"Johnny\" Doe"]}] + end + end + + describe "error handling" do + test "returns {:error, reason} for empty file" do + assert {:error, reason} = CsvParser.parse("") + assert reason =~ "empty" + end + + test "returns {:error, reason} when no header row found" do + # Only whitespace after BOM strip + assert {:error, reason} = CsvParser.parse(" \n ") + assert reason =~ "CSV file is empty" + end + + test "returns {:error, reason} for invalid CSV format" do + # Unbalanced quotes + csv_content = "email;name\n\"john@example.com;John" + + assert {:error, reason} = CsvParser.parse(csv_content) + assert is_binary(reason) + end + end + + describe "module documentation" do + test "module has @moduledoc" do + assert Code.ensure_loaded?(CsvParser) + + {:docs_v1, _, _, _, %{"en" => moduledoc}, _, _} = Code.fetch_docs(CsvParser) + assert is_binary(moduledoc) + assert String.length(moduledoc) > 0 + end + end +end diff --git a/test/mv/membership/import/member_csv_test.exs b/test/mv/membership/import/member_csv_test.exs index 5ac5311..e918afd 100644 --- a/test/mv/membership/import/member_csv_test.exs +++ b/test/mv/membership/import/member_csv_test.exs @@ -44,7 +44,6 @@ defmodule Mv.Membership.Import.MemberCSVTest do assert match?({:ok, _}, result) or match?({:error, _}, result) end - # Skip until Issue #3 is implemented @tag :skip test "returns {:ok, import_state} on success" do file_content = "email\njohn@example.com" From 3bbe9895eebb5d8a7e4ad7ef4b6942412e5b6eba Mon Sep 17 00:00:00 2001 From: carla Date: Thu, 15 Jan 2026 11:08:22 +0100 
Subject: [PATCH 4/5] fix: improve CSV parser error handling --- lib/mv/membership/import/csv_parser.ex | 53 ++++++++----------- test/mv/membership/import/csv_parser_test.exs | 32 +++++++++-- 2 files changed, 51 insertions(+), 34 deletions(-) diff --git a/lib/mv/membership/import/csv_parser.ex b/lib/mv/membership/import/csv_parser.ex index b9f036d..22a7496 100644 --- a/lib/mv/membership/import/csv_parser.ex +++ b/lib/mv/membership/import/csv_parser.ex @@ -23,13 +23,9 @@ defmodule Mv.Membership.Import.CsvParser do @spec parse(binary()) :: {:ok, row(), [numbered_row()]} | {:error, String.t()} def parse(file_content) when is_binary(file_content) do - content = - file_content - |> strip_bom() - |> normalize_line_endings() - - with :ok <- validate_content_not_empty(content), - :ok <- check_balanced_quotes(content), + with :ok <- validate_utf8(file_content), + content <- file_content |> strip_bom() |> normalize_line_endings(), + :ok <- validate_content_not_empty(content), {:ok, header_record, data_records} <- extract_header_and_data(content), :ok <- validate_header_not_empty(header_record), {:ok, headers, rows} <- parse_csv_records(header_record, data_records) do @@ -41,6 +37,15 @@ defmodule Mv.Membership.Import.CsvParser do def parse(_), do: {:error, "Invalid CSV content"} + @spec validate_utf8(binary()) :: :ok | {:error, String.t()} + defp validate_utf8(content) do + if String.valid?(content) do + :ok + else + {:error, "CSV must be valid UTF-8"} + end + end + @spec validate_content_not_empty(binary()) :: :ok | {:error, String.t()} defp validate_content_not_empty(content) do if String.trim(content) == "" do @@ -157,10 +162,10 @@ defmodule Mv.Membership.Import.CsvParser do try do rows = data_records - |> Enum.reduce([], fn {line_no, record}, acc -> + |> Enum.reduce_while([], fn {line_no, record}, acc -> case String.trim(record) do "" -> - acc + {:cont, acc} _ -> parsed = parser.parse_string(ensure_trailing_newline(record), skip_headers: false) @@ -168,20 +173,23 @@ 
defmodule Mv.Membership.Import.CsvParser do case parsed do [row] when is_list(row) -> if empty_row?(row) do - acc + {:cont, acc} else - [{line_no, row} | acc] + {:cont, [{line_no, row} | acc]} end - # empty row or unparsable row -> treat as skipped empty row + # unparsable row -> return error with line number _ -> - acc + snippet = String.slice(record, 0, min(50, String.length(record))) + {:halt, {:error, "Failed to parse CSV data at line #{line_no}: #{inspect(snippet)}"}} end end end) - |> Enum.reverse() - {:ok, rows} + case rows do + {:error, reason} -> {:error, reason} + rows -> {:ok, Enum.reverse(rows)} + end rescue e -> {:error, "Failed to parse CSV: #{Exception.message(e)}"} @@ -193,21 +201,6 @@ defmodule Mv.Membership.Import.CsvParser do Enum.all?(row, fn field -> String.trim(field) == "" end) end - # Check if quotes are balanced in the content. - # - # This is a simple check that counts quote characters. In CSV, escaped quotes - # are represented as "", so an odd number of quotes indicates unbalanced quotes. 
- @spec check_balanced_quotes(binary()) :: :ok | {:error, String.t()} - defp check_balanced_quotes(content) do - quote_count = content |> :binary.bin_to_list() |> Enum.count(&(&1 == @quote)) - - if rem(quote_count, 2) != 0 do - {:error, "Unbalanced quotes in CSV file"} - else - :ok - end - end - # --- Record splitting with correct line numbers (quote-aware) --- # # This splits the CSV into "records" separated by newline *outside quotes*, diff --git a/test/mv/membership/import/csv_parser_test.exs b/test/mv/membership/import/csv_parser_test.exs index 539e57e..e4a762b 100644 --- a/test/mv/membership/import/csv_parser_test.exs +++ b/test/mv/membership/import/csv_parser_test.exs @@ -109,7 +109,7 @@ defmodule Mv.Membership.Import.CsvParserTest do assert {:ok, headers, rows} = CsvParser.parse(csv_content) assert headers == ["email"] - # Lines 2, 3, 4 are empty (skipped), line 4 has data + # Lines 2 & 3 are empty (skipped), line 4 has data assert rows == [{4, ["john@example.com"]}] end end @@ -156,6 +156,19 @@ defmodule Mv.Membership.Import.CsvParserTest do assert headers == ["email", "name"] assert rows == [{2, ["john@example.com", "John \"Johnny\" Doe"]}] end + + test "handles multiline quoted fields with correct line numbering" do + # Header line 1 + # Data record starts line 2, contains "foo\nbar" in a field + # Record ends physically at line 3 + # Expected: row gets line number 2 (start line) + csv_content = "email;description\njohn@example.com;\"foo\nbar\"" + + assert {:ok, headers, rows} = CsvParser.parse(csv_content) + + assert headers == ["email", "description"] + assert rows == [{2, ["john@example.com", "foo\nbar"]}] + end end describe "error handling" do @@ -170,12 +183,23 @@ defmodule Mv.Membership.Import.CsvParserTest do assert reason =~ "CSV file is empty" end - test "returns {:error, reason} for invalid CSV format" do - # Unbalanced quotes - csv_content = "email;name\n\"john@example.com;John" + test "returns {:error, reason} for invalid UTF-8 content" do + # 
Invalid UTF-8 sequence + invalid_utf8 = <<0xFF, 0xFE, 0xFD>> + + assert {:error, reason} = CsvParser.parse(invalid_utf8) + assert reason =~ "UTF-8" + end + + test "returns {:error, reason} for unparsable data row" do + # Malformed CSV row that cannot be parsed + # NimbleCSV will throw an exception for unclosed quotes + csv_content = "email;name\njohn@example.com;\"unclosed quote" assert {:error, reason} = CsvParser.parse(csv_content) assert is_binary(reason) + # Error message should indicate parsing failure + assert reason =~ "parse" or reason =~ "CSV" end end From 8a5d012895903c458836291a3e85f83c6ccdfc98 Mon Sep 17 00:00:00 2001 From: carla Date: Thu, 15 Jan 2026 12:15:22 +0100 Subject: [PATCH 5/5] refactor parser --- lib/mv/membership/import/csv_parser.ex | 136 +++++++++++++++---------- 1 file changed, 81 insertions(+), 55 deletions(-) diff --git a/lib/mv/membership/import/csv_parser.ex b/lib/mv/membership/import/csv_parser.ex index 22a7496..2de75ee 100644 --- a/lib/mv/membership/import/csv_parser.ex +++ b/lib/mv/membership/import/csv_parser.ex @@ -16,6 +16,7 @@ defmodule Mv.Membership.Import.CsvParser do @utf8_bom <<0xEF, 0xBB, 0xBF>> @quote ?" 
+ @max_error_snippet_length 50 @type line_number :: pos_integer() @type row :: [String.t()] @@ -27,11 +28,8 @@ defmodule Mv.Membership.Import.CsvParser do content <- file_content |> strip_bom() |> normalize_line_endings(), :ok <- validate_content_not_empty(content), {:ok, header_record, data_records} <- extract_header_and_data(content), - :ok <- validate_header_not_empty(header_record), - {:ok, headers, rows} <- parse_csv_records(header_record, data_records) do - {:ok, headers, rows} - else - {:error, reason} -> {:error, reason} + :ok <- validate_header_not_empty(header_record) do + parse_csv_records(header_record, data_records) end end @@ -55,7 +53,8 @@ defmodule Mv.Membership.Import.CsvParser do end end - @spec extract_header_and_data(binary()) :: {:ok, binary(), [{line_number(), binary()}]} | {:error, String.t()} + @spec extract_header_and_data(binary()) :: + {:ok, binary(), [{line_number(), binary()}]} | {:error, String.t()} defp extract_header_and_data(content) do records = split_records_with_line_numbers(content) @@ -126,27 +125,25 @@ defmodule Mv.Membership.Import.CsvParser do end # Parses exactly one record (string without trailing newline is fine). - # Returns {:ok, row} or {:error, reason}. + # Returns `{:ok, row}` or `{:error, reason}`. @spec parse_single_record(module(), binary(), String.t() | nil) :: {:ok, row()} | {:error, String.t()} defp parse_single_record(parser, record, error_reason_if_empty) do - try do - # NimbleCSV is happiest if there's a newline at the end. - rows = parser.parse_string(ensure_trailing_newline(record), skip_headers: false) + # NimbleCSV is happiest if there's a newline at the end. 
+ rows = parser.parse_string(ensure_trailing_newline(record), skip_headers: false) - case rows do - [row] when is_list(row) and row != [] -> - {:ok, row} + case rows do + [row] when is_list(row) and row != [] -> + {:ok, row} - _ -> - if is_binary(error_reason_if_empty), - do: {:error, error_reason_if_empty}, - else: {:error, "Failed to parse CSV header"} - end - rescue - e -> - {:error, "Failed to parse CSV: #{Exception.message(e)}"} + _ -> + if is_binary(error_reason_if_empty), + do: {:error, error_reason_if_empty}, + else: {:error, "Failed to parse CSV header"} end + rescue + e -> + {:error, "Failed to parse CSV: #{Exception.message(e)}"} end @spec ensure_trailing_newline(binary()) :: binary() @@ -155,44 +152,59 @@ defmodule Mv.Membership.Import.CsvParser do end # --- Data parsing preserving *physical* line numbers --- - + # + # Parses data records while preserving physical line numbers. + # Skips empty rows but maintains correct line numbering for error reporting. + # @spec parse_data_records(module(), [{line_number(), binary()}]) :: {:ok, [numbered_row()]} | {:error, String.t()} defp parse_data_records(parser, data_records) do - try do - rows = - data_records - |> Enum.reduce_while([], fn {line_no, record}, acc -> - case String.trim(record) do - "" -> - {:cont, acc} + rows = + data_records + |> Enum.reduce_while([], fn {line_no, record}, acc -> + process_data_record(parser, line_no, record, acc) + end) - _ -> - parsed = parser.parse_string(ensure_trailing_newline(record), skip_headers: false) + case rows do + {:error, reason} -> {:error, reason} + rows -> {:ok, Enum.reverse(rows)} + end + rescue + e -> + {:error, "Failed to parse CSV: #{Exception.message(e)}"} + end - case parsed do - [row] when is_list(row) -> - if empty_row?(row) do - {:cont, acc} - else - {:cont, [{line_no, row} | acc]} - end + @spec process_data_record(module(), line_number(), binary(), [numbered_row()]) :: + {:cont, [numbered_row()]} | {:halt, {:error, String.t()}} + defp 
process_data_record(parser, line_no, record, acc) do + trimmed = String.trim(record) - # unparsable row -> return error with line number - _ -> - snippet = String.slice(record, 0, min(50, String.length(record))) - {:halt, {:error, "Failed to parse CSV data at line #{line_no}: #{inspect(snippet)}"}} - end - end - end) + if trimmed == "" do + {:cont, acc} + else + process_non_empty_record(parser, line_no, record, acc) + end + end - case rows do - {:error, reason} -> {:error, reason} - rows -> {:ok, Enum.reverse(rows)} - end - rescue - e -> - {:error, "Failed to parse CSV: #{Exception.message(e)}"} + @spec process_non_empty_record(module(), line_number(), binary(), [numbered_row()]) :: + {:cont, [numbered_row()]} | {:halt, {:error, String.t()}} + defp process_non_empty_record(parser, line_no, record, acc) do + parsed = parser.parse_string(ensure_trailing_newline(record), skip_headers: false) + + case parsed do + [row] when is_list(row) -> + if empty_row?(row) do + {:cont, acc} + else + {:cont, [{line_no, row} | acc]} + end + + # unparsable row -> return error with line number + _ -> + snippet = + String.slice(record, 0, min(@max_error_snippet_length, String.length(record))) + + {:halt, {:error, "Failed to parse CSV data at line #{line_no}: #{inspect(snippet)}"}} end end @@ -203,8 +215,8 @@ defmodule Mv.Membership.Import.CsvParser do # --- Record splitting with correct line numbers (quote-aware) --- # - # This splits the CSV into "records" separated by newline *outside quotes*, - # returning [{start_line_number, record_string_without_newline}, ...] + # Splits the CSV into records separated by newline *outside quotes*. + # Returns `[{start_line_number, record_string_without_newline}, ...]`. # # Line numbers are 1-based and represent the physical line in the CSV file. # Empty lines are included in the numbering (they're just skipped later). 
@@ -228,7 +240,21 @@ defmodule Mv.Membership.Import.CsvParser do Enum.reverse(acc) end - # EOF + # Recursively splits CSV content into records with correct line numbering. + # + # Handles quote-aware parsing: + # - Escaped quotes (`""`) inside quoted fields are preserved + # - Newlines inside quotes are part of the record but advance line counter + # - Newlines outside quotes end a record + # + # Parameters: + # - `content` - Remaining binary content to parse + # - `acc` - Accumulated records `[{line_number, record}, ...]` + # - `buf` - Current record buffer (reversed byte list) + # - `in_quotes` - Whether we're currently inside a quoted field + # - `line` - Current physical line number + # - `start_line` - Line number where current record started + # @spec do_split( binary(), [{line_number(), binary()}],