Implement CSV parser closes #331 #351

Merged
carla merged 5 commits from feature/331_scv_parsing into main 2026-01-15 13:38:37 +01:00
Showing only changes of commit 8a5d012895 - Show all commits

View file

@ -16,6 +16,7 @@ defmodule Mv.Membership.Import.CsvParser do
@utf8_bom <<0xEF, 0xBB, 0xBF>> @utf8_bom <<0xEF, 0xBB, 0xBF>>
@quote ?" @quote ?"
@max_error_snippet_length 50
@type line_number :: pos_integer() @type line_number :: pos_integer()
@type row :: [String.t()] @type row :: [String.t()]
@ -27,11 +28,8 @@ defmodule Mv.Membership.Import.CsvParser do
content <- file_content |> strip_bom() |> normalize_line_endings(), content <- file_content |> strip_bom() |> normalize_line_endings(),
:ok <- validate_content_not_empty(content), :ok <- validate_content_not_empty(content),
{:ok, header_record, data_records} <- extract_header_and_data(content), {:ok, header_record, data_records} <- extract_header_and_data(content),
:ok <- validate_header_not_empty(header_record), :ok <- validate_header_not_empty(header_record) do
{:ok, headers, rows} <- parse_csv_records(header_record, data_records) do parse_csv_records(header_record, data_records)
{:ok, headers, rows}
else
{:error, reason} -> {:error, reason}
end end
end end
@ -55,7 +53,8 @@ defmodule Mv.Membership.Import.CsvParser do
end end
end end
@spec extract_header_and_data(binary()) :: {:ok, binary(), [{line_number(), binary()}]} | {:error, String.t()} @spec extract_header_and_data(binary()) ::
{:ok, binary(), [{line_number(), binary()}]} | {:error, String.t()}
defp extract_header_and_data(content) do defp extract_header_and_data(content) do
records = split_records_with_line_numbers(content) records = split_records_with_line_numbers(content)
@ -126,11 +125,10 @@ defmodule Mv.Membership.Import.CsvParser do
end end
# Parses exactly one record (string without trailing newline is fine). # Parses exactly one record (string without trailing newline is fine).
# Returns {:ok, row} or {:error, reason}. # Returns `{:ok, row}` or `{:error, reason}`.
@spec parse_single_record(module(), binary(), String.t() | nil) :: @spec parse_single_record(module(), binary(), String.t() | nil) ::
{:ok, row()} | {:error, String.t()} {:ok, row()} | {:error, String.t()}
defp parse_single_record(parser, record, error_reason_if_empty) do defp parse_single_record(parser, record, error_reason_if_empty) do
try do
# NimbleCSV is happiest if there's a newline at the end. # NimbleCSV is happiest if there's a newline at the end.
rows = parser.parse_string(ensure_trailing_newline(record), skip_headers: false) rows = parser.parse_string(ensure_trailing_newline(record), skip_headers: false)
@ -147,7 +145,6 @@ defmodule Mv.Membership.Import.CsvParser do
e -> e ->
{:error, "Failed to parse CSV: #{Exception.message(e)}"} {:error, "Failed to parse CSV: #{Exception.message(e)}"}
end end
end
@spec ensure_trailing_newline(binary()) :: binary() @spec ensure_trailing_newline(binary()) :: binary()
defp ensure_trailing_newline(str) do defp ensure_trailing_newline(str) do
@ -155,19 +152,43 @@ defmodule Mv.Membership.Import.CsvParser do
end end
# --- Data parsing preserving *physical* line numbers --- # --- Data parsing preserving *physical* line numbers ---
#
# Parses data records while preserving physical line numbers.
# Skips empty rows but maintains correct line numbering for error reporting.
#
@spec parse_data_records(module(), [{line_number(), binary()}]) :: @spec parse_data_records(module(), [{line_number(), binary()}]) ::
{:ok, [numbered_row()]} | {:error, String.t()} {:ok, [numbered_row()]} | {:error, String.t()}
defp parse_data_records(parser, data_records) do defp parse_data_records(parser, data_records) do
try do
rows = rows =
data_records data_records
|> Enum.reduce_while([], fn {line_no, record}, acc -> |> Enum.reduce_while([], fn {line_no, record}, acc ->
case String.trim(record) do process_data_record(parser, line_no, record, acc)
"" -> end)
{:cont, acc}
_ -> case rows do
{:error, reason} -> {:error, reason}
rows -> {:ok, Enum.reverse(rows)}
end
rescue
e ->
{:error, "Failed to parse CSV: #{Exception.message(e)}"}
end
@spec process_data_record(module(), line_number(), binary(), [numbered_row()]) ::
{:cont, [numbered_row()]} | {:halt, {:error, String.t()}}
defp process_data_record(parser, line_no, record, acc) do
trimmed = String.trim(record)
if trimmed == "" do
{:cont, acc}
else
process_non_empty_record(parser, line_no, record, acc)
end
end
@spec process_non_empty_record(module(), line_number(), binary(), [numbered_row()]) ::
{:cont, [numbered_row()]} | {:halt, {:error, String.t()}}
defp process_non_empty_record(parser, line_no, record, acc) do
parsed = parser.parse_string(ensure_trailing_newline(record), skip_headers: false) parsed = parser.parse_string(ensure_trailing_newline(record), skip_headers: false)
case parsed do case parsed do
@ -180,19 +201,10 @@ defmodule Mv.Membership.Import.CsvParser do
# unparsable row -> return error with line number # unparsable row -> return error with line number
_ -> _ ->
snippet = String.slice(record, 0, min(50, String.length(record))) snippet =
{:halt, {:error, "Failed to parse CSV data at line #{line_no}: #{inspect(snippet)}"}} String.slice(record, 0, min(@max_error_snippet_length, String.length(record)))
end
end
end)
case rows do {:halt, {:error, "Failed to parse CSV data at line #{line_no}: #{inspect(snippet)}"}}
{:error, reason} -> {:error, reason}
rows -> {:ok, Enum.reverse(rows)}
end
rescue
e ->
{:error, "Failed to parse CSV: #{Exception.message(e)}"}
end end
end end
@ -203,8 +215,8 @@ defmodule Mv.Membership.Import.CsvParser do
# --- Record splitting with correct line numbers (quote-aware) --- # --- Record splitting with correct line numbers (quote-aware) ---
# #
# This splits the CSV into "records" separated by newline *outside quotes*, # Splits the CSV into records separated by newline *outside quotes*.
# returning [{start_line_number, record_string_without_newline}, ...] # Returns `[{start_line_number, record_string_without_newline}, ...]`.
# #
# Line numbers are 1-based and represent the physical line in the CSV file. # Line numbers are 1-based and represent the physical line in the CSV file.
# Empty lines are included in the numbering (they're just skipped later). # Empty lines are included in the numbering (they're just skipped later).
@ -228,7 +240,21 @@ defmodule Mv.Membership.Import.CsvParser do
Enum.reverse(acc) Enum.reverse(acc)
end end
# EOF # Recursively splits CSV content into records with correct line numbering.
#
# Handles quote-aware parsing:
# - Escaped quotes (`""`) inside quoted fields are preserved
# - Newlines inside quotes are part of the record but advance line counter
# - Newlines outside quotes end a record
#
# Parameters:
# - `content` - Remaining binary content to parse
# - `acc` - Accumulated records `[{line_number, record}, ...]`
# - `buf` - Current record buffer (reversed byte list)
# - `in_quotes` - Whether we're currently inside a quoted field
# - `line` - Current physical line number
# - `start_line` - Line number where current record started
#
@spec do_split( @spec do_split(
binary(), binary(),
[{line_number(), binary()}], [{line_number(), binary()}],