From 8a5d012895903c458836291a3e85f83c6ccdfc98 Mon Sep 17 00:00:00 2001 From: carla Date: Thu, 15 Jan 2026 12:15:22 +0100 Subject: [PATCH] refactor parser --- lib/mv/membership/import/csv_parser.ex | 136 +++++++++++++++---------- 1 file changed, 81 insertions(+), 55 deletions(-) diff --git a/lib/mv/membership/import/csv_parser.ex b/lib/mv/membership/import/csv_parser.ex index 22a7496..2de75ee 100644 --- a/lib/mv/membership/import/csv_parser.ex +++ b/lib/mv/membership/import/csv_parser.ex @@ -16,6 +16,7 @@ defmodule Mv.Membership.Import.CsvParser do @utf8_bom <<0xEF, 0xBB, 0xBF>> @quote ?" + @max_error_snippet_length 50 @type line_number :: pos_integer() @type row :: [String.t()] @@ -27,11 +28,8 @@ defmodule Mv.Membership.Import.CsvParser do content <- file_content |> strip_bom() |> normalize_line_endings(), :ok <- validate_content_not_empty(content), {:ok, header_record, data_records} <- extract_header_and_data(content), - :ok <- validate_header_not_empty(header_record), - {:ok, headers, rows} <- parse_csv_records(header_record, data_records) do - {:ok, headers, rows} - else - {:error, reason} -> {:error, reason} + :ok <- validate_header_not_empty(header_record) do + parse_csv_records(header_record, data_records) end end @@ -55,7 +53,8 @@ defmodule Mv.Membership.Import.CsvParser do end end - @spec extract_header_and_data(binary()) :: {:ok, binary(), [{line_number(), binary()}]} | {:error, String.t()} + @spec extract_header_and_data(binary()) :: + {:ok, binary(), [{line_number(), binary()}]} | {:error, String.t()} defp extract_header_and_data(content) do records = split_records_with_line_numbers(content) @@ -126,27 +125,25 @@ defmodule Mv.Membership.Import.CsvParser do end # Parses exactly one record (string without trailing newline is fine). - # Returns {:ok, row} or {:error, reason}. + # Returns `{:ok, row}` or `{:error, reason}`. @spec parse_single_record(module(), binary(), String.t() | nil) :: {:ok, row()} | {:error, String.t()} defp parse_single_record(parser, record, error_reason_if_empty) do - try do - # NimbleCSV is happiest if there's a newline at the end. - rows = parser.parse_string(ensure_trailing_newline(record), skip_headers: false) + # NimbleCSV is happiest if there's a newline at the end. + rows = parser.parse_string(ensure_trailing_newline(record), skip_headers: false) - case rows do - [row] when is_list(row) and row != [] -> - {:ok, row} + case rows do + [row] when is_list(row) and row != [] -> + {:ok, row} - _ -> - if is_binary(error_reason_if_empty), - do: {:error, error_reason_if_empty}, - else: {:error, "Failed to parse CSV header"} - end - rescue - e -> - {:error, "Failed to parse CSV: #{Exception.message(e)}"} + _ -> + if is_binary(error_reason_if_empty), + do: {:error, error_reason_if_empty}, + else: {:error, "Failed to parse CSV header"} end + rescue + e -> + {:error, "Failed to parse CSV: #{Exception.message(e)}"} end @spec ensure_trailing_newline(binary()) :: binary() @@ -155,44 +152,59 @@ defmodule Mv.Membership.Import.CsvParser do end # --- Data parsing preserving *physical* line numbers --- - + # + # Parses data records while preserving physical line numbers. + # Skips empty rows but maintains correct line numbering for error reporting. + # @spec parse_data_records(module(), [{line_number(), binary()}]) :: {:ok, [numbered_row()]} | {:error, String.t()} defp parse_data_records(parser, data_records) do - try do - rows = - data_records - |> Enum.reduce_while([], fn {line_no, record}, acc -> - case String.trim(record) do - "" -> - {:cont, acc} + rows = + data_records + |> Enum.reduce_while([], fn {line_no, record}, acc -> + process_data_record(parser, line_no, record, acc) + end) - _ -> - parsed = parser.parse_string(ensure_trailing_newline(record), skip_headers: false) + case rows do + {:error, reason} -> {:error, reason} + rows -> {:ok, Enum.reverse(rows)} + end + rescue + e -> + {:error, "Failed to parse CSV: #{Exception.message(e)}"} + end - case parsed do - [row] when is_list(row) -> - if empty_row?(row) do - {:cont, acc} - else - {:cont, [{line_no, row} | acc]} - end + @spec process_data_record(module(), line_number(), binary(), [numbered_row()]) :: + {:cont, [numbered_row()]} | {:halt, {:error, String.t()}} + defp process_data_record(parser, line_no, record, acc) do + trimmed = String.trim(record) - # unparsable row -> return error with line number - _ -> - snippet = String.slice(record, 0, min(50, String.length(record))) - {:halt, {:error, "Failed to parse CSV data at line #{line_no}: #{inspect(snippet)}"}} - end - end - end) + if trimmed == "" do + {:cont, acc} + else + process_non_empty_record(parser, line_no, record, acc) + end + end - case rows do - {:error, reason} -> {:error, reason} - rows -> {:ok, Enum.reverse(rows)} - end - rescue - e -> - {:error, "Failed to parse CSV: #{Exception.message(e)}"} + @spec process_non_empty_record(module(), line_number(), binary(), [numbered_row()]) :: + {:cont, [numbered_row()]} | {:halt, {:error, String.t()}} + defp process_non_empty_record(parser, line_no, record, acc) do + parsed = parser.parse_string(ensure_trailing_newline(record), skip_headers: false) + + case parsed do + [row] when is_list(row) -> + if empty_row?(row) do + {:cont, acc} + else + {:cont, [{line_no, row} | acc]} + end + + # unparsable row -> return error with line number + _ -> + snippet = + String.slice(record, 0, min(@max_error_snippet_length, String.length(record))) + + {:halt, {:error, "Failed to parse CSV data at line #{line_no}: #{inspect(snippet)}"}} end end @@ -203,8 +215,8 @@ defmodule Mv.Membership.Import.CsvParser do # --- Record splitting with correct line numbers (quote-aware) --- # - # This splits the CSV into "records" separated by newline *outside quotes*, - # returning [{start_line_number, record_string_without_newline}, ...] + # Splits the CSV into records separated by newline *outside quotes*. + # Returns `[{start_line_number, record_string_without_newline}, ...]`. # # Line numbers are 1-based and represent the physical line in the CSV file. # Empty lines are included in the numbering (they're just skipped later). @@ -228,7 +240,21 @@ defmodule Mv.Membership.Import.CsvParser do Enum.reverse(acc) end - # EOF + # Recursively splits CSV content into records with correct line numbering. + # + # Handles quote-aware parsing: + # - Escaped quotes (`""`) inside quoted fields are preserved + # - Newlines inside quotes are part of the record but advance line counter + # - Newlines outside quotes end a record + # + # Parameters: + # - `content` - Remaining binary content to parse + # - `acc` - Accumulated records `[{line_number, record}, ...]` + # - `buf` - Current record buffer (reversed byte list) + # - `in_quotes` - Whether we're currently inside a quoted field + # - `line` - Current physical line number + # - `start_line` - Line number where current record started + # @spec do_split( binary(), [{line_number(), binary()}],