NimbleCSV.define(Mv.Membership.Import.CsvParserSemicolon, separator: ";", escape: "\"")
NimbleCSV.define(Mv.Membership.Import.CsvParserComma, separator: ",", escape: "\"")

defmodule Mv.Membership.Import.CsvParser do
  @moduledoc """
  CSV parser with BOM handling, delimiter auto-detection, and physical line numbering.

  Guarantees:

  - UTF-8 BOM is stripped (Excel)
  - Delimiter auto-detected (semicolon/comma) using NimbleCSV parsing (quote-aware)
  - Returns rows tagged with their *physical start line number* in the CSV file (1-based)
  - Skips completely empty rows (but preserves numbering by using physical line numbers)
  - Handles `\\r\\n`, `\\n`, `\\r`
  - Correct even when fields contain newlines inside quotes: the row gets the start line number
  """

  @utf8_bom <<0xEF, 0xBB, 0xBF>>
  @quote ?"
  @max_error_snippet_length 50

  @type line_number :: pos_integer()
  @type row :: [String.t()]
  @type numbered_row :: {line_number(), row()}

  @spec parse(binary()) :: {:ok, row(), [numbered_row()]} | {:error, String.t()}
  def parse(file_content) when is_binary(file_content) do
    with :ok <- validate_utf8(file_content),
         content <- file_content |> strip_bom() |> normalize_line_endings(),
         :ok <- validate_content_not_empty(content),
         {:ok, header_record, data_records} <- extract_header_and_data(content),
         :ok <- validate_header_not_empty(header_record) do
      parse_csv_records(header_record, data_records)
    end
  end

  def parse(_), do: {:error, "Invalid CSV content"}

  @spec validate_utf8(binary()) :: :ok | {:error, String.t()}
  defp validate_utf8(content) do
    if String.valid?(content) do
      :ok
    else
      {:error, "CSV must be valid UTF-8"}
    end
  end

  @spec validate_content_not_empty(binary()) :: :ok | {:error, String.t()}
  defp validate_content_not_empty(content) do
    if String.trim(content) == "" do
      {:error, "CSV file is empty"}
    else
      :ok
    end
  end

  @spec extract_header_and_data(binary()) ::
          {:ok, binary(), [{line_number(), binary()}]} | {:error, String.t()}
  defp extract_header_and_data(content) do
    records = split_records_with_line_numbers(content)

    case records do
      [] -> {:error, "CSV file is empty"}
      [{_line1, header_record} | data_records] -> {:ok, header_record, data_records}
    end
  end

  @spec validate_header_not_empty(binary()) :: :ok | {:error, String.t()}
  defp validate_header_not_empty(header_record) do
    if String.trim(header_record) == "" do
      {:error, "CSV file has no header row"}
    else
      :ok
    end
  end

  @spec parse_csv_records(binary(), [{line_number(), binary()}]) ::
          {:ok, row(), [numbered_row()]} | {:error, String.t()}
  defp parse_csv_records(header_record, data_records) do
    delimiter = detect_delimiter_by_parsing(header_record)
    parser = get_parser(delimiter)

    with {:ok, headers} <-
           parse_single_record(parser, header_record, "CSV file has no header row"),
         {:ok, rows} <- parse_data_records(parser, data_records) do
      {:ok, headers, rows}
    end
  end

  @spec strip_bom(binary()) :: binary()
  defp strip_bom(<<@utf8_bom, rest::binary>>), do: rest
  defp strip_bom(content), do: content

  @spec normalize_line_endings(binary()) :: binary()
  defp normalize_line_endings(content) do
    content
    |> String.replace("\r\n", "\n")
    |> String.replace("\r", "\n")
  end

  @spec get_parser(String.t()) :: module()
  defp get_parser(";"), do: Mv.Membership.Import.CsvParserSemicolon
  defp get_parser(","), do: Mv.Membership.Import.CsvParserComma
  defp get_parser(_), do: Mv.Membership.Import.CsvParserSemicolon

  # --- Delimiter detection (quote-aware by actually parsing the header) ---
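  #
  # Illustrative sketch of the heuristic below (expected values, not executed
  # code; the header strings are made up and the private helper is called
  # directly only for the sake of the example):
  #
  #     detect_delimiter_by_parsing("Vorname;Nachname;Email")  #=> ";"
  #     detect_delimiter_by_parsing("first_name,last_name")    #=> ","
  #     detect_delimiter_by_parsing(~s("Doe, Jane";Email))     #=> ";"  (comma is quoted)
  #     detect_delimiter_by_parsing("single_column")           #=> ";"  (tie prefers ";")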

  @spec detect_delimiter_by_parsing(binary()) :: String.t()
  defp detect_delimiter_by_parsing(header_record) do
    semicolon_score =
      header_field_count(Mv.Membership.Import.CsvParserSemicolon, header_record)

    comma_score =
      header_field_count(Mv.Membership.Import.CsvParserComma, header_record)

    # prefer ";" on tie
    if semicolon_score >= comma_score, do: ";", else: ","
  end

  @spec header_field_count(module(), binary()) :: non_neg_integer()
  defp header_field_count(parser, header_record) do
    case parse_single_record(parser, header_record, nil) do
      {:ok, fields} -> Enum.count(fields, &(String.trim(&1) != ""))
      {:error, _} -> 0
    end
  end

  # Parses exactly one record (string without trailing newline is fine).
  # Returns `{:ok, row}` or `{:error, reason}`.
  @spec parse_single_record(module(), binary(), String.t() | nil) ::
          {:ok, row()} | {:error, String.t()}
  defp parse_single_record(parser, record, error_reason_if_empty) do
    # NimbleCSV is happiest if there's a newline at the end.
    rows = parser.parse_string(ensure_trailing_newline(record), skip_headers: false)

    case rows do
      [row] when is_list(row) and row != [] ->
        {:ok, row}

      _ ->
        if is_binary(error_reason_if_empty),
          do: {:error, error_reason_if_empty},
          else: {:error, "Failed to parse CSV header"}
    end
  rescue
    e -> {:error, "Failed to parse CSV: #{Exception.message(e)}"}
  end

  @spec ensure_trailing_newline(binary()) :: binary()
  defp ensure_trailing_newline(str) do
    if String.ends_with?(str, "\n"), do: str, else: str <> "\n"
  end

  # --- Data parsing preserving *physical* line numbers ---
  #
  # Parses data records while preserving physical line numbers.
  # Skips empty rows but maintains correct line numbering for error reporting.
  #
  @spec parse_data_records(module(), [{line_number(), binary()}]) ::
          {:ok, [numbered_row()]} | {:error, String.t()}
  defp parse_data_records(parser, data_records) do
    rows =
      data_records
      |> Enum.reduce_while([], fn {line_no, record}, acc ->
        process_data_record(parser, line_no, record, acc)
      end)

    case rows do
      {:error, reason} -> {:error, reason}
      rows -> {:ok, Enum.reverse(rows)}
    end
  rescue
    e -> {:error, "Failed to parse CSV: #{Exception.message(e)}"}
  end

  @spec process_data_record(module(), line_number(), binary(), [numbered_row()]) ::
          {:cont, [numbered_row()]} | {:halt, {:error, String.t()}}
  defp process_data_record(parser, line_no, record, acc) do
    trimmed = String.trim(record)

    if trimmed == "" do
      {:cont, acc}
    else
      process_non_empty_record(parser, line_no, record, acc)
    end
  end

  @spec process_non_empty_record(module(), line_number(), binary(), [numbered_row()]) ::
          {:cont, [numbered_row()]} | {:halt, {:error, String.t()}}
  defp process_non_empty_record(parser, line_no, record, acc) do
    parsed = parser.parse_string(ensure_trailing_newline(record), skip_headers: false)

    case parsed do
      [row] when is_list(row) ->
        if empty_row?(row) do
          {:cont, acc}
        else
          {:cont, [{line_no, row} | acc]}
        end

      # unparsable row -> return error with line number
      _ ->
        snippet = String.slice(record, 0, min(@max_error_snippet_length, String.length(record)))
        {:halt, {:error, "Failed to parse CSV data at line #{line_no}: #{inspect(snippet)}"}}
    end
  end

  @spec empty_row?(row()) :: boolean()
  defp empty_row?(row) when is_list(row) do
    Enum.all?(row, fn field -> String.trim(field) == "" end)
  end

  # --- Record splitting with correct line numbers (quote-aware) ---
  #
  # Splits the CSV into records separated by newline *outside quotes*.
  # Returns `[{start_line_number, record_string_without_newline}, ...]`.
  #
  # Line numbers are 1-based and represent the physical line in the CSV file.
  # Empty lines are included in the numbering (they're just skipped later).
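  #
  # Illustrative sketch of the splitting (expected values, not executed code;
  # the CSV string is made up and the private helper is called directly only
  # for the sake of the example):
  #
  #     csv = "name;note\nAda;\"line one\nline two\"\n\nBob;ok\n"
  #
  #     split_records_with_line_numbers(csv)
  #     #=> [
  #     #     {1, "name;note"},
  #     #     {2, "Ada;\"line one\nline two\""},  # quoted newline: record keeps start line 2
  #     #     {4, ""},                            # physically empty line 4, skipped later
  #     #     {5, "Bob;ok"}
  #     #   ]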
  #
  @spec split_records_with_line_numbers(binary()) :: [{line_number(), binary()}]
  defp split_records_with_line_numbers(content) do
    {acc, buf, _in_quotes, _line, start_line} = do_split(content, [], [], false, 1, 1)

    # finalize last record only if there is buffered content
    acc =
      case buf do
        [] ->
          acc

        _ ->
          record = buf |> Enum.reverse() |> :erlang.list_to_binary()
          [{start_line, record} | acc]
      end

    Enum.reverse(acc)
  end

  # Recursively splits CSV content into records with correct line numbering.
  #
  # Handles quote-aware parsing:
  # - Escaped quotes (`""`) inside quoted fields are preserved
  # - Newlines inside quotes are part of the record but advance the line counter
  # - Newlines outside quotes end a record
  #
  # Parameters:
  # - `content` - Remaining binary content to parse
  # - `acc` - Accumulated records `[{line_number, record}, ...]`
  # - `buf` - Current record buffer (reversed byte list)
  # - `in_quotes` - Whether we're currently inside a quoted field
  # - `line` - Current physical line number
  # - `start_line` - Line number where the current record started
  #
  @spec do_split(
          binary(),
          [{line_number(), binary()}],
          [byte()],
          boolean(),
          line_number(),
          line_number()
        ) :: {[{line_number(), binary()}], [byte()], boolean(), line_number(), line_number()}
  defp do_split(<<>>, acc, buf, in_quotes, line, start_line),
    do: {acc, buf, in_quotes, line, start_line}

  # Escaped quote inside quoted field: "" -> keep both quotes, do NOT toggle in_quotes
  defp do_split(<<@quote, @quote, rest::binary>>, acc, buf, true = in_quotes, line, start_line) do
    do_split(rest, acc, [@quote, @quote | buf], in_quotes, line, start_line)
  end

  # Quote toggles quote state (when not escaped as "")
  defp do_split(<<@quote, rest::binary>>, acc, buf, in_quotes, line, start_line) do
    do_split(rest, acc, [@quote | buf], not in_quotes, line, start_line)
  end

  # Newline outside quotes ends a record (even if empty)
  defp do_split(<<"\n", rest::binary>>, acc, buf, false, line, start_line) do
    record = buf |> Enum.reverse() |> :erlang.list_to_binary()
    do_split(rest, [{start_line, record} | acc], [], false, line + 1, line + 1)
  end

  # Newline inside quotes is part of the record, but advances the physical line counter
  defp do_split(<<"\n", rest::binary>>, acc, buf, true = in_quotes, line, start_line) do
    do_split(rest, acc, [?\n | buf], in_quotes, line + 1, start_line)
  end

  # Any other byte is appended to the current record buffer
  defp do_split(<<ch, rest::binary>>, acc, buf, in_quotes, line, start_line) do
    do_split(rest, acc, [ch | buf], in_quotes, line, start_line)
  end
end
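
# End-to-end usage sketch (expected values, not an executed doctest; the CSV
# string below is made up). The input carries a UTF-8 BOM, Windows line
# endings, an empty line, and a quoted field with an embedded newline:
#
#     csv = "\uFEFFname;city\r\nAda;Berlin\r\n\r\nGrace;\"New\nYork\"\r\n"
#
#     Mv.Membership.Import.CsvParser.parse(csv)
#     #=> {:ok, ["name", "city"],
#     #     [
#     #       {2, ["Ada", "Berlin"]},
#     #       {4, ["Grace", "New\nYork"]}
#     #     ]}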