From 68e19bea1836011e3f6cf547261bfaa1fa3f95ee Mon Sep 17 00:00:00 2001
From: carla
Date: Thu, 15 Jan 2026 10:10:02 +0100
Subject: [PATCH] feat: add csv parser

---
 lib/mv/membership/import/csv_parser.ex | 276 +++++++++++++++++++++++++
 1 file changed, 276 insertions(+)
 create mode 100644 lib/mv/membership/import/csv_parser.ex

diff --git a/lib/mv/membership/import/csv_parser.ex b/lib/mv/membership/import/csv_parser.ex
new file mode 100644
index 0000000..b9f036d
--- /dev/null
+++ b/lib/mv/membership/import/csv_parser.ex
@@ -0,0 +1,276 @@
NimbleCSV.define(Mv.Membership.Import.CsvParserSemicolon, separator: ";", escape: "\"")
NimbleCSV.define(Mv.Membership.Import.CsvParserComma, separator: ",", escape: "\"")

defmodule Mv.Membership.Import.CsvParser do
  @moduledoc """
  CSV parser with BOM handling, delimiter auto-detection, and physical line numbering.

  Guarantees:
  - UTF-8 BOM is stripped (Excel)
  - Delimiter auto-detected (semicolon/comma) using NimbleCSV parsing (quote-aware)
  - Returns rows tagged with their *physical start line number* in the CSV file (1-based)
  - Skips completely empty rows (but preserves numbering by using physical line numbers)
  - Handles `\\r\\n`, `\\n`, `\\r`
  - Correct even when fields contain newlines inside quotes: the row gets the start line number
  """

  @utf8_bom <<0xEF, 0xBB, 0xBF>>
  @quote ?"

  @type line_number :: pos_integer()
  @type row :: [String.t()]
  @type numbered_row :: {line_number(), row()}

  @spec parse(binary()) :: {:ok, row(), [numbered_row()]} | {:error, String.t()}
  def parse(file_content) when is_binary(file_content) do
    content =
      file_content
      |> strip_bom()
      |> normalize_line_endings()

    with :ok <- validate_content_not_empty(content),
         :ok <- check_balanced_quotes(content),
         {:ok, header_record, data_records} <- extract_header_and_data(content),
         :ok <- validate_header_not_empty(header_record),
         {:ok, headers, rows} <- parse_csv_records(header_record, data_records) do
      {:ok, headers, rows}
    else
      {:error, reason} -> {:error, reason}
    end
  end

  def parse(_), do: {:error, "Invalid CSV content"}

  @spec validate_content_not_empty(binary()) :: :ok | {:error, String.t()}
  defp validate_content_not_empty(content) do
    if String.trim(content) == "" do
      {:error, "CSV file is empty"}
    else
      :ok
    end
  end

  @spec extract_header_and_data(binary()) :: {:ok, binary(), [{line_number(), binary()}]} | {:error, String.t()}
  defp extract_header_and_data(content) do
    records = split_records_with_line_numbers(content)

    case records do
      [] ->
        {:error, "CSV file is empty"}

      [{_line1, header_record} | data_records] ->
        {:ok, header_record, data_records}
    end
  end

  @spec validate_header_not_empty(binary()) :: :ok | {:error, String.t()}
  defp validate_header_not_empty(header_record) do
    if String.trim(header_record) == "" do
      {:error, "CSV file has no header row"}
    else
      :ok
    end
  end

  @spec parse_csv_records(binary(), [{line_number(), binary()}]) ::
          {:ok, row(), [numbered_row()]} | {:error, String.t()}
  defp parse_csv_records(header_record, data_records) do
    delimiter = detect_delimiter_by_parsing(header_record)
    parser = get_parser(delimiter)

    with {:ok, headers} <-
           parse_single_record(parser, header_record, "CSV file has no header row"),
         {:ok, rows} <- parse_data_records(parser, data_records) do
      {:ok, headers, rows}
    end
  end

  @spec strip_bom(binary()) :: binary()
  defp strip_bom(<<@utf8_bom, rest::binary>>), do: rest
  defp strip_bom(content), do: content

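  # Illustrative example (sample values only): an Excel export that starts with a
  # UTF-8 BOM and uses CRLF line endings parses as follows.
  #
  #   content = <<0xEF, 0xBB, 0xBF>> <> "name;email\r\nAda;ada@example.org\r\n"
  #   Mv.Membership.Import.CsvParser.parse(content)
  #   #=> {:ok, ["name", "email"], [{2, ["Ada", "ada@example.org"]}]}
  #
  # strip_bom/1 and normalize_line_endings/1 below remove the BOM and CRLF endings
  # before any record splitting happens; the data row keeps its physical line
  # number (2) even though the header occupies line 1.
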
  @spec normalize_line_endings(binary()) :: binary()
  defp normalize_line_endings(content) do
    content
    |> String.replace("\r\n", "\n")
    |> String.replace("\r", "\n")
  end

  @spec get_parser(String.t()) :: module()
  defp get_parser(";"), do: Mv.Membership.Import.CsvParserSemicolon
  defp get_parser(","), do: Mv.Membership.Import.CsvParserComma
  defp get_parser(_), do: Mv.Membership.Import.CsvParserSemicolon

  # --- Delimiter detection (quote-aware by actually parsing the header) ---

  @spec detect_delimiter_by_parsing(binary()) :: String.t()
  defp detect_delimiter_by_parsing(header_record) do
    semicolon_score = header_field_count(Mv.Membership.Import.CsvParserSemicolon, header_record)
    comma_score = header_field_count(Mv.Membership.Import.CsvParserComma, header_record)

    # prefer ";" on tie
    if semicolon_score >= comma_score, do: ";", else: ","
  end

  @spec header_field_count(module(), binary()) :: non_neg_integer()
  defp header_field_count(parser, header_record) do
    case parse_single_record(parser, header_record, nil) do
      {:ok, fields} -> Enum.count(fields, &(String.trim(&1) != ""))
      {:error, _} -> 0
    end
  end

  # Parses exactly one record (string without trailing newline is fine).
  # Returns {:ok, row} or {:error, reason}.
  @spec parse_single_record(module(), binary(), String.t() | nil) ::
          {:ok, row()} | {:error, String.t()}
  defp parse_single_record(parser, record, error_reason_if_empty) do
    try do
      # NimbleCSV is happiest if there's a newline at the end.
      rows = parser.parse_string(ensure_trailing_newline(record), skip_headers: false)

      case rows do
        [row] when is_list(row) and row != [] ->
          {:ok, row}

        _ ->
          if is_binary(error_reason_if_empty),
            do: {:error, error_reason_if_empty},
            else: {:error, "Failed to parse CSV header"}
      end
    rescue
      e ->
        {:error, "Failed to parse CSV: #{Exception.message(e)}"}
    end
  end

  @spec ensure_trailing_newline(binary()) :: binary()
  defp ensure_trailing_newline(str) do
    if String.ends_with?(str, "\n"), do: str, else: str <> "\n"
  end

  # --- Data parsing preserving *physical* line numbers ---

  @spec parse_data_records(module(), [{line_number(), binary()}]) ::
          {:ok, [numbered_row()]} | {:error, String.t()}
  defp parse_data_records(parser, data_records) do
    try do
      rows =
        data_records
        |> Enum.reduce([], fn {line_no, record}, acc ->
          case String.trim(record) do
            "" ->
              acc

            _ ->
              parsed = parser.parse_string(ensure_trailing_newline(record), skip_headers: false)

              case parsed do
                [row] when is_list(row) ->
                  if empty_row?(row) do
                    acc
                  else
                    [{line_no, row} | acc]
                  end

                # empty row or unparsable row -> treat as skipped empty row
                _ ->
                  acc
              end
          end
        end)
        |> Enum.reverse()

      {:ok, rows}
    rescue
      e ->
        {:error, "Failed to parse CSV: #{Exception.message(e)}"}
    end
  end

  @spec empty_row?(row()) :: boolean()
  defp empty_row?(row) when is_list(row) do
    Enum.all?(row, fn field -> String.trim(field) == "" end)
  end

  # Check if quotes are balanced in the content.
  #
  # This is a simple check that counts quote characters. In CSV, escaped quotes
  # are represented as "", so an odd number of quotes indicates unbalanced quotes.
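  #
  # For example (sample values): ~s(name;"Fa""brik") contains four quote characters
  # and passes, while ~s(name;"Fabrik) contains a single quote and is rejected with
  # {:error, "Unbalanced quotes in CSV file"}.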
  @spec check_balanced_quotes(binary()) :: :ok | {:error, String.t()}
  defp check_balanced_quotes(content) do
    quote_count = content |> :binary.bin_to_list() |> Enum.count(&(&1 == @quote))

    if rem(quote_count, 2) != 0 do
      {:error, "Unbalanced quotes in CSV file"}
    else
      :ok
    end
  end

  # --- Record splitting with correct line numbers (quote-aware) ---
  #
  # This splits the CSV into "records" separated by newline *outside quotes*,
  # returning [{start_line_number, record_string_without_newline}, ...]
  #
  # Line numbers are 1-based and represent the physical line in the CSV file.
  # Empty lines are included in the numbering (they're just skipped later).
  #
  @spec split_records_with_line_numbers(binary()) :: [{line_number(), binary()}]
  defp split_records_with_line_numbers(content) do
    {acc, buf, _in_quotes, _line, start_line} =
      do_split(content, [], [], false, 1, 1)

    # finalize last record only if there is buffered content
    acc =
      case buf do
        [] ->
          acc

        _ ->
          record = buf |> Enum.reverse() |> :erlang.list_to_binary()
          [{start_line, record} | acc]
      end

    Enum.reverse(acc)
  end

  # EOF
  @spec do_split(
          binary(),
          [{line_number(), binary()}],
          [byte()],
          boolean(),
          line_number(),
          line_number()
        ) ::
          {[{line_number(), binary()}], [byte()], boolean(), line_number(), line_number()}
  defp do_split(<<>>, acc, buf, in_quotes, line, start_line),
    do: {acc, buf, in_quotes, line, start_line}

  # Escaped quote inside quoted field: "" -> keep both quotes, do NOT toggle in_quotes
  defp do_split(<<@quote, @quote, rest::binary>>, acc, buf, true = in_quotes, line, start_line) do
    do_split(rest, acc, [@quote, @quote | buf], in_quotes, line, start_line)
  end

  # Quote toggles quote state (when not escaped "")
  defp do_split(<<@quote, rest::binary>>, acc, buf, in_quotes, line, start_line) do
    do_split(rest, acc, [@quote | buf], not in_quotes, line, start_line)
  end

  # Newline outside quotes ends a record (even if empty)
  defp do_split(<<"\n", rest::binary>>, acc, buf, false, line, start_line) do
    record = buf |> Enum.reverse() |> :erlang.list_to_binary()
    do_split(rest, [{start_line, record} | acc], [], false, line + 1, line + 1)
  end

  # Newline inside quotes is part of the record, but advances physical line counter
  defp do_split(<<"\n", rest::binary>>, acc, buf, true = in_quotes, line, start_line) do
    do_split(rest, acc, [?\n | buf], in_quotes, line + 1, start_line)
  end

  # Any other byte
  defp do_split(<<ch, rest::binary>>, acc, buf, in_quotes, line, start_line) do
    do_split(rest, acc, [ch | buf], in_quotes, line, start_line)
  end
end
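
# Illustrative trace of the line-numbering guarantee (sample values only): given
# the three physical input lines
#
#   name;note
#   "multi
#   line";ok
#
# split_records_with_line_numbers/1 yields [{1, "name;note"}, {2, ~s("multi\nline";ok)}],
# so parse/1 tags the quoted multi-line row with its start line:
#
#   {:ok, ["name", "note"], [{2, ["multi\nline", "ok"]}]}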