refactor parser
This commit is contained in:
parent
3bbe9895ee
commit
8a5d012895
1 changed files with 81 additions and 55 deletions
|
|
@ -16,6 +16,7 @@ defmodule Mv.Membership.Import.CsvParser do
|
||||||
|
|
||||||
@utf8_bom <<0xEF, 0xBB, 0xBF>>
|
@utf8_bom <<0xEF, 0xBB, 0xBF>>
|
||||||
@quote ?"
|
@quote ?"
|
||||||
|
@max_error_snippet_length 50
|
||||||
|
|
||||||
@type line_number :: pos_integer()
|
@type line_number :: pos_integer()
|
||||||
@type row :: [String.t()]
|
@type row :: [String.t()]
|
||||||
|
|
@ -27,11 +28,8 @@ defmodule Mv.Membership.Import.CsvParser do
|
||||||
content <- file_content |> strip_bom() |> normalize_line_endings(),
|
content <- file_content |> strip_bom() |> normalize_line_endings(),
|
||||||
:ok <- validate_content_not_empty(content),
|
:ok <- validate_content_not_empty(content),
|
||||||
{:ok, header_record, data_records} <- extract_header_and_data(content),
|
{:ok, header_record, data_records} <- extract_header_and_data(content),
|
||||||
:ok <- validate_header_not_empty(header_record),
|
:ok <- validate_header_not_empty(header_record) do
|
||||||
{:ok, headers, rows} <- parse_csv_records(header_record, data_records) do
|
parse_csv_records(header_record, data_records)
|
||||||
{:ok, headers, rows}
|
|
||||||
else
|
|
||||||
{:error, reason} -> {:error, reason}
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
@ -55,7 +53,8 @@ defmodule Mv.Membership.Import.CsvParser do
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
@spec extract_header_and_data(binary()) :: {:ok, binary(), [{line_number(), binary()}]} | {:error, String.t()}
|
@spec extract_header_and_data(binary()) ::
|
||||||
|
{:ok, binary(), [{line_number(), binary()}]} | {:error, String.t()}
|
||||||
defp extract_header_and_data(content) do
|
defp extract_header_and_data(content) do
|
||||||
records = split_records_with_line_numbers(content)
|
records = split_records_with_line_numbers(content)
|
||||||
|
|
||||||
|
|
@ -126,27 +125,25 @@ defmodule Mv.Membership.Import.CsvParser do
|
||||||
end
|
end
|
||||||
|
|
||||||
# Parses exactly one record (string without trailing newline is fine).
|
# Parses exactly one record (string without trailing newline is fine).
|
||||||
# Returns {:ok, row} or {:error, reason}.
|
# Returns `{:ok, row}` or `{:error, reason}`.
|
||||||
@spec parse_single_record(module(), binary(), String.t() | nil) ::
|
@spec parse_single_record(module(), binary(), String.t() | nil) ::
|
||||||
{:ok, row()} | {:error, String.t()}
|
{:ok, row()} | {:error, String.t()}
|
||||||
defp parse_single_record(parser, record, error_reason_if_empty) do
|
defp parse_single_record(parser, record, error_reason_if_empty) do
|
||||||
try do
|
# NimbleCSV is happiest if there's a newline at the end.
|
||||||
# NimbleCSV is happiest if there's a newline at the end.
|
rows = parser.parse_string(ensure_trailing_newline(record), skip_headers: false)
|
||||||
rows = parser.parse_string(ensure_trailing_newline(record), skip_headers: false)
|
|
||||||
|
|
||||||
case rows do
|
case rows do
|
||||||
[row] when is_list(row) and row != [] ->
|
[row] when is_list(row) and row != [] ->
|
||||||
{:ok, row}
|
{:ok, row}
|
||||||
|
|
||||||
_ ->
|
_ ->
|
||||||
if is_binary(error_reason_if_empty),
|
if is_binary(error_reason_if_empty),
|
||||||
do: {:error, error_reason_if_empty},
|
do: {:error, error_reason_if_empty},
|
||||||
else: {:error, "Failed to parse CSV header"}
|
else: {:error, "Failed to parse CSV header"}
|
||||||
end
|
|
||||||
rescue
|
|
||||||
e ->
|
|
||||||
{:error, "Failed to parse CSV: #{Exception.message(e)}"}
|
|
||||||
end
|
end
|
||||||
|
rescue
|
||||||
|
e ->
|
||||||
|
{:error, "Failed to parse CSV: #{Exception.message(e)}"}
|
||||||
end
|
end
|
||||||
|
|
||||||
@spec ensure_trailing_newline(binary()) :: binary()
|
@spec ensure_trailing_newline(binary()) :: binary()
|
||||||
|
|
@ -155,44 +152,59 @@ defmodule Mv.Membership.Import.CsvParser do
|
||||||
end
|
end
|
||||||
|
|
||||||
# --- Data parsing preserving *physical* line numbers ---
|
# --- Data parsing preserving *physical* line numbers ---
|
||||||
|
#
|
||||||
|
# Parses data records while preserving physical line numbers.
|
||||||
|
# Skips empty rows but maintains correct line numbering for error reporting.
|
||||||
|
#
|
||||||
@spec parse_data_records(module(), [{line_number(), binary()}]) ::
|
@spec parse_data_records(module(), [{line_number(), binary()}]) ::
|
||||||
{:ok, [numbered_row()]} | {:error, String.t()}
|
{:ok, [numbered_row()]} | {:error, String.t()}
|
||||||
defp parse_data_records(parser, data_records) do
|
defp parse_data_records(parser, data_records) do
|
||||||
try do
|
rows =
|
||||||
rows =
|
data_records
|
||||||
data_records
|
|> Enum.reduce_while([], fn {line_no, record}, acc ->
|
||||||
|> Enum.reduce_while([], fn {line_no, record}, acc ->
|
process_data_record(parser, line_no, record, acc)
|
||||||
case String.trim(record) do
|
end)
|
||||||
"" ->
|
|
||||||
{:cont, acc}
|
|
||||||
|
|
||||||
_ ->
|
case rows do
|
||||||
parsed = parser.parse_string(ensure_trailing_newline(record), skip_headers: false)
|
{:error, reason} -> {:error, reason}
|
||||||
|
rows -> {:ok, Enum.reverse(rows)}
|
||||||
|
end
|
||||||
|
rescue
|
||||||
|
e ->
|
||||||
|
{:error, "Failed to parse CSV: #{Exception.message(e)}"}
|
||||||
|
end
|
||||||
|
|
||||||
case parsed do
|
@spec process_data_record(module(), line_number(), binary(), [numbered_row()]) ::
|
||||||
[row] when is_list(row) ->
|
{:cont, [numbered_row()]} | {:halt, {:error, String.t()}}
|
||||||
if empty_row?(row) do
|
defp process_data_record(parser, line_no, record, acc) do
|
||||||
{:cont, acc}
|
trimmed = String.trim(record)
|
||||||
else
|
|
||||||
{:cont, [{line_no, row} | acc]}
|
|
||||||
end
|
|
||||||
|
|
||||||
# unparsable row -> return error with line number
|
if trimmed == "" do
|
||||||
_ ->
|
{:cont, acc}
|
||||||
snippet = String.slice(record, 0, min(50, String.length(record)))
|
else
|
||||||
{:halt, {:error, "Failed to parse CSV data at line #{line_no}: #{inspect(snippet)}"}}
|
process_non_empty_record(parser, line_no, record, acc)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end)
|
|
||||||
|
|
||||||
case rows do
|
@spec process_non_empty_record(module(), line_number(), binary(), [numbered_row()]) ::
|
||||||
{:error, reason} -> {:error, reason}
|
{:cont, [numbered_row()]} | {:halt, {:error, String.t()}}
|
||||||
rows -> {:ok, Enum.reverse(rows)}
|
defp process_non_empty_record(parser, line_no, record, acc) do
|
||||||
end
|
parsed = parser.parse_string(ensure_trailing_newline(record), skip_headers: false)
|
||||||
rescue
|
|
||||||
e ->
|
case parsed do
|
||||||
{:error, "Failed to parse CSV: #{Exception.message(e)}"}
|
[row] when is_list(row) ->
|
||||||
|
if empty_row?(row) do
|
||||||
|
{:cont, acc}
|
||||||
|
else
|
||||||
|
{:cont, [{line_no, row} | acc]}
|
||||||
|
end
|
||||||
|
|
||||||
|
# unparsable row -> return error with line number
|
||||||
|
_ ->
|
||||||
|
snippet =
|
||||||
|
String.slice(record, 0, min(@max_error_snippet_length, String.length(record)))
|
||||||
|
|
||||||
|
{:halt, {:error, "Failed to parse CSV data at line #{line_no}: #{inspect(snippet)}"}}
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
@ -203,8 +215,8 @@ defmodule Mv.Membership.Import.CsvParser do
|
||||||
|
|
||||||
# --- Record splitting with correct line numbers (quote-aware) ---
|
# --- Record splitting with correct line numbers (quote-aware) ---
|
||||||
#
|
#
|
||||||
# This splits the CSV into "records" separated by newline *outside quotes*,
|
# Splits the CSV into records separated by newline *outside quotes*.
|
||||||
# returning [{start_line_number, record_string_without_newline}, ...]
|
# Returns `[{start_line_number, record_string_without_newline}, ...]`.
|
||||||
#
|
#
|
||||||
# Line numbers are 1-based and represent the physical line in the CSV file.
|
# Line numbers are 1-based and represent the physical line in the CSV file.
|
||||||
# Empty lines are included in the numbering (they're just skipped later).
|
# Empty lines are included in the numbering (they're just skipped later).
|
||||||
|
|
@ -228,7 +240,21 @@ defmodule Mv.Membership.Import.CsvParser do
|
||||||
Enum.reverse(acc)
|
Enum.reverse(acc)
|
||||||
end
|
end
|
||||||
|
|
||||||
# EOF
|
# Recursively splits CSV content into records with correct line numbering.
|
||||||
|
#
|
||||||
|
# Handles quote-aware parsing:
|
||||||
|
# - Escaped quotes (`""`) inside quoted fields are preserved
|
||||||
|
# - Newlines inside quotes are part of the record but advance line counter
|
||||||
|
# - Newlines outside quotes end a record
|
||||||
|
#
|
||||||
|
# Parameters:
|
||||||
|
# - `content` - Remaining binary content to parse
|
||||||
|
# - `acc` - Accumulated records `[{line_number, record}, ...]`
|
||||||
|
# - `buf` - Current record buffer (reversed byte list)
|
||||||
|
# - `in_quotes` - Whether we're currently inside a quoted field
|
||||||
|
# - `line` - Current physical line number
|
||||||
|
# - `start_line` - Line number where current record started
|
||||||
|
#
|
||||||
@spec do_split(
|
@spec do_split(
|
||||||
binary(),
|
binary(),
|
||||||
[{line_number(), binary()}],
|
[{line_number(), binary()}],
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue