feat: add csv parser

2026-01-15 10:10:02 +01:00 · 2026-01-15 10:10:02 +01:00 · 68e19bea18
commit 68e19bea18
parent 699d4385cb
1 changed files with 276 additions and 0 deletions
--- a/lib/mv/membership/import/csv_parser.ex
+++ b/lib/mv/membership/import/csv_parser.ex
@ -0,0 +1,276 @@
+# Define one NimbleCSV parser module per supported delimiter (";" and ",").
+# Both use the double quote as the escape character.
+NimbleCSV.define(Mv.Membership.Import.CsvParserSemicolon, separator: ";", escape: "\"")
+NimbleCSV.define(Mv.Membership.Import.CsvParserComma, separator: ",", escape: "\"")
+
+defmodule Mv.Membership.Import.CsvParser do
+  @moduledoc """
+  CSV parser with BOM handling, delimiter auto-detection, and physical line numbering.
+
+  Guarantees:
+  - UTF-8 BOM is stripped (Excel)
+  - Delimiter auto-detected (semicolon/comma) using NimbleCSV parsing (quote-aware)
+  - Returns rows tagged with their *physical start line number* in the CSV file (1-based)
+  - Skips completely empty rows (but preserves numbering by using physical line numbers)
+  - Handles `\\r\\n`, `\\n`, `\\r`
+  - Correct even when fields contain newlines inside quotes: the row gets the start line number
+  """
+
+  # The three bytes of the UTF-8 byte order mark; stripped from the start of the input.
+  @utf8_bom <<0xEF, 0xBB, 0xBF>>
+  # Code point of the double-quote character, used when tracking quoted sections.
+  @quote ?"
+
+  # 1-based physical line number within the CSV file.
+  @type line_number :: pos_integer()
+  # One parsed CSV record: the list of its field values.
+  @type row :: [String.t()]
+  # A parsed record paired with the physical line number on which it starts.
+  @type numbered_row :: {line_number(), row()}
+
+  @doc """
+  Parses raw CSV file content into a header row and numbered data rows.
+
+  The content is first stripped of a leading UTF-8 BOM and its line endings are
+  normalized to `\\n`. The first record is treated as the header and is also used
+  to detect the delimiter.
+
+  Returns `{:ok, headers, rows}`, where `rows` is a list of
+  `{line_number, fields}` tuples, or `{:error, reason}` with a message such as
+  `"CSV file is empty"`, `"Unbalanced quotes in CSV file"`,
+  `"CSV file has no header row"` or `"Failed to parse CSV: ..."`.
+
+  Any non-binary argument yields `{:error, "Invalid CSV content"}`.
+  """
+  @spec parse(binary()) :: {:ok, row(), [numbered_row()]} | {:error, String.t()}
+  def parse(file_content) when is_binary(file_content) do
+    content =
+      file_content
+      |> strip_bom()
+      |> normalize_line_endings()
+
+    # Each step returns a self-describing `{:error, reason}`, so a failed match
+    # simply falls through `with` unchanged; no `else` clause is needed.
+    with :ok <- validate_content_not_empty(content),
+         :ok <- check_balanced_quotes(content),
+         {:ok, header_record, data_records} <- extract_header_and_data(content),
+         :ok <- validate_header_not_empty(header_record) do
+      parse_csv_records(header_record, data_records)
+    end
+  end
+
+  def parse(_), do: {:error, "Invalid CSV content"}
+
+  # Rejects content that is empty or consists only of whitespace.
+  @spec validate_content_not_empty(binary()) :: :ok | {:error, String.t()}
+  defp validate_content_not_empty(content) do
+    case String.trim(content) do
+      "" -> {:error, "CSV file is empty"}
+      _non_blank -> :ok
+    end
+  end
+
+  # Splits the content into records and separates the first record (the header)
+  # from the remaining line-numbered data records.
+  @spec extract_header_and_data(binary()) :: {:ok, binary(), [{line_number(), binary()}]} | {:error, String.t()}
+  defp extract_header_and_data(content) do
+    case split_records_with_line_numbers(content) do
+      [{_header_line, header} | remaining] ->
+        {:ok, header, remaining}
+
+      [] ->
+        {:error, "CSV file is empty"}
+    end
+  end
+
+  # Rejects a header record that is blank after trimming whitespace.
+  @spec validate_header_not_empty(binary()) :: :ok | {:error, String.t()}
+  defp validate_header_not_empty(header_record) do
+    blank? = String.trim(header_record) == ""
+
+    if blank?, do: {:error, "CSV file has no header row"}, else: :ok
+  end
+
+  # Picks the parser module from the delimiter detected on the header record, then
+  # parses the header and all data records with it. The first failing step's
+  # `{:error, reason}` is returned unchanged.
+  @spec parse_csv_records(binary(), [{line_number(), binary()}]) ::
+          {:ok, row(), [numbered_row()]} | {:error, String.t()}
+  defp parse_csv_records(header_record, data_records) do
+    csv_module =
+      header_record
+      |> detect_delimiter_by_parsing()
+      |> get_parser()
+
+    with {:ok, header_fields} <-
+           parse_single_record(csv_module, header_record, "CSV file has no header row"),
+         {:ok, numbered_rows} <- parse_data_records(csv_module, data_records) do
+      {:ok, header_fields, numbered_rows}
+    end
+  end
+
+  # Drops a leading UTF-8 byte order mark, if present; other content is returned as is.
+  @spec strip_bom(binary()) :: binary()
+  defp strip_bom(content) do
+    case content do
+      <<@utf8_bom, without_bom::binary>> -> without_bom
+      _no_bom -> content
+    end
+  end
+
+  # Converts every `\r\n` pair and every remaining lone `\r` into a single `\n`.
+  @spec normalize_line_endings(binary()) :: binary()
+  defp normalize_line_endings(content) do
+    # The optional `\n` is matched greedily, so a CRLF pair collapses to one newline.
+    String.replace(content, ~r/\r\n?/, "\n")
+  end
+
+  # Maps a delimiter to its NimbleCSV parser module. Anything other than ","
+  # (including ";") resolves to the semicolon parser.
+  @spec get_parser(String.t()) :: module()
+  defp get_parser(delimiter) do
+    case delimiter do
+      "," -> Mv.Membership.Import.CsvParserComma
+      _semicolon_or_unknown -> Mv.Membership.Import.CsvParserSemicolon
+    end
+  end
+
+  # --- Delimiter detection (quote-aware by actually parsing the header) ---
+
+  # Parses the header with both parsers and picks the delimiter that yields more
+  # non-blank fields. A tie goes to ";".
+  @spec detect_delimiter_by_parsing(binary()) :: String.t()
+  defp detect_delimiter_by_parsing(header_record) do
+    with_semicolon = header_field_count(Mv.Membership.Import.CsvParserSemicolon, header_record)
+    with_comma = header_field_count(Mv.Membership.Import.CsvParserComma, header_record)
+
+    # "," only wins when it is strictly better
+    if with_comma > with_semicolon, do: ",", else: ";"
+  end
+
+  # Counts the non-blank fields the given parser finds in the header record.
+  # A header that fails to parse scores 0.
+  @spec header_field_count(module(), binary()) :: non_neg_integer()
+  defp header_field_count(parser, header_record) do
+    case parse_single_record(parser, header_record, nil) do
+      {:error, _reason} ->
+        0
+
+      {:ok, fields} ->
+        fields
+        |> Enum.reject(fn field -> String.trim(field) == "" end)
+        |> length()
+    end
+  end
+
+  # Parses a single record with the given NimbleCSV parser module.
+  #
+  # Returns `{:ok, fields}` when the parser yields exactly one non-empty row.
+  # Otherwise returns `{:error, error_reason_if_empty}` when that argument is a
+  # binary, or a generic header error when it is not. Any exception raised while
+  # parsing is converted into `{:error, "Failed to parse CSV: ..."}`.
+  @spec parse_single_record(module(), binary(), String.t() | nil) ::
+          {:ok, row()} | {:error, String.t()}
+  defp parse_single_record(parser, record, error_reason_if_empty) do
+    # A trailing newline is appended first so the parser sees a terminated line.
+    case parser.parse_string(ensure_trailing_newline(record), skip_headers: false) do
+      [[_ | _] = fields] ->
+        {:ok, fields}
+
+      _ when is_binary(error_reason_if_empty) ->
+        {:error, error_reason_if_empty}
+
+      _ ->
+        {:error, "Failed to parse CSV header"}
+    end
+  rescue
+    exception ->
+      {:error, "Failed to parse CSV: #{Exception.message(exception)}"}
+  end
+
+  # Appends "\n" unless the string already ends with one.
+  @spec ensure_trailing_newline(binary()) :: binary()
+  defp ensure_trailing_newline(str) do
+    case String.ends_with?(str, "\n") do
+      true -> str
+      false -> str <> "\n"
+    end
+  end
+
+  # --- Data parsing preserving *physical* line numbers ---
+
+  # Parses each line-numbered record and keeps `{line_no, fields}` for every record
+  # that yields exactly one row with at least one non-blank field, in input order.
+  #
+  # Blank records, all-blank rows and records that do not parse to exactly one row
+  # are skipped. Any exception raised while parsing aborts the whole run and is
+  # returned as `{:error, "Failed to parse CSV: ..."}`.
+  @spec parse_data_records(module(), [{line_number(), binary()}]) ::
+          {:ok, [numbered_row()]} | {:error, String.t()}
+  defp parse_data_records(parser, data_records) do
+    numbered_rows =
+      Enum.flat_map(data_records, fn {line_no, record} ->
+        with false <- String.trim(record) == "",
+             [fields] when is_list(fields) <-
+               parser.parse_string(ensure_trailing_newline(record), skip_headers: false),
+             false <- empty_row?(fields) do
+          [{line_no, fields}]
+        else
+          # blank record, empty row, or unexpected parser output -> skipped
+          _ -> []
+        end
+      end)
+
+    {:ok, numbered_rows}
+  rescue
+    exception ->
+      {:error, "Failed to parse CSV: #{Exception.message(exception)}"}
+  end
+
+  # True when no field of the row has content after trimming (also true for `[]`).
+  @spec empty_row?(row()) :: boolean()
+  defp empty_row?(row) when is_list(row) do
+    not Enum.any?(row, &(String.trim(&1) != ""))
+  end
+
+  # Rejects content whose total number of quote characters is odd.
+  #
+  # This is a coarse pre-check rather than a structural validation: in CSV an
+  # escaped quote is written as "", so an odd number of quotes indicates
+  # unbalanced quotes.
+  #
+  # The quotes are counted with `:binary.matches/2`, which builds one entry per
+  # quote found instead of turning every byte of the file into a list element.
+  #
+  # Returns `:ok` or `{:error, "Unbalanced quotes in CSV file"}`.
+  @spec check_balanced_quotes(binary()) :: :ok | {:error, String.t()}
+  defp check_balanced_quotes(content) do
+    quote_count = content |> :binary.matches(<<@quote>>) |> length()
+
+    if rem(quote_count, 2) == 0 do
+      :ok
+    else
+      {:error, "Unbalanced quotes in CSV file"}
+    end
+  end
+
+  # --- Record splitting with correct line numbers (quote-aware) ---
+  #
+  # Cuts the content into records at every newline that is *outside* quotes and
+  # returns [{start_line_number, record_without_trailing_newline}, ...] in file order.
+  #
+  # Line numbers are 1-based physical lines. Empty lines produce empty records
+  # (filtered out later), so numbering is never shifted. Text left over after the
+  # last newline becomes a final record only if it is non-empty.
+  #
+  @spec split_records_with_line_numbers(binary()) :: [{line_number(), binary()}]
+  defp split_records_with_line_numbers(content) do
+    {completed, pending, _in_quotes, _line, pending_start} =
+      do_split(content, [], [], false, 1, 1)
+
+    all_records =
+      if pending == [] do
+        completed
+      else
+        # `pending` holds the bytes of the last record in reverse order
+        last_record = pending |> Enum.reverse() |> IO.iodata_to_binary()
+        [{pending_start, last_record} | completed]
+      end
+
+    Enum.reverse(all_records)
+  end
+
+  # Byte-by-byte scanner behind split_records_with_line_numbers/1.
+  #
+  # Arguments: remaining input, finished records (newest first), bytes of the
+  # current record (reversed), whether the scanner is inside quotes, the current
+  # physical line, and the line on which the current record started.
+  #
+  # Clause order matters: the escaped-quote clause must precede the single-quote one.
+  @spec do_split(
+          binary(),
+          [{line_number(), binary()}],
+          [byte()],
+          boolean(),
+          line_number(),
+          line_number()
+        ) ::
+          {[{line_number(), binary()}], [byte()], boolean(), line_number(), line_number()}
+  # End of input: hand the full scanner state back to the caller.
+  defp do_split(<<>>, records, pending, in_quotes, line, start_line),
+    do: {records, pending, in_quotes, line, start_line}
+
+  # "" while inside quotes is an escaped quote: keep both bytes, stay inside quotes.
+  defp do_split(<<@quote, @quote, tail::binary>>, records, pending, true, line, start_line),
+    do: do_split(tail, records, [@quote, @quote | pending], true, line, start_line)
+
+  # Any other quote flips the inside-quotes state.
+  defp do_split(<<@quote, tail::binary>>, records, pending, in_quotes, line, start_line),
+    do: do_split(tail, records, [@quote | pending], not in_quotes, line, start_line)
+
+  # Newline: always advances the physical line counter.
+  defp do_split(<<?\n, tail::binary>>, records, pending, in_quotes, line, start_line) do
+    case in_quotes do
+      true ->
+        # inside quotes the newline belongs to the current record
+        do_split(tail, records, [?\n | pending], true, line + 1, start_line)
+
+      false ->
+        # outside quotes it closes the record (even an empty one); the next
+        # record starts on the following line
+        finished = pending |> Enum.reverse() |> IO.iodata_to_binary()
+        do_split(tail, [{start_line, finished} | records], [], false, line + 1, line + 1)
+    end
+  end
+
+  # Every other byte is appended to the current record.
+  defp do_split(<<byte, tail::binary>>, records, pending, in_quotes, line, start_line),
+    do: do_split(tail, records, [byte | pending], in_quotes, line, start_line)
+end