feat: add csv parser
This commit is contained in:
parent
699d4385cb
commit
68e19bea18
1 changed files with 276 additions and 0 deletions
276
lib/mv/membership/import/csv_parser.ex
Normal file
276
lib/mv/membership/import/csv_parser.ex
Normal file
|
|
@ -0,0 +1,276 @@
|
|||
NimbleCSV.define(Mv.Membership.Import.CsvParserSemicolon, separator: ";", escape: "\"")
|
||||
NimbleCSV.define(Mv.Membership.Import.CsvParserComma, separator: ",", escape: "\"")
|
||||
|
||||
defmodule Mv.Membership.Import.CsvParser do
|
||||
@moduledoc """
|
||||
CSV parser with BOM handling, delimiter auto-detection, and physical line numbering.
|
||||
|
||||
Guarantees:
|
||||
- UTF-8 BOM is stripped (Excel)
|
||||
- Delimiter auto-detected (semicolon/comma) using NimbleCSV parsing (quote-aware)
|
||||
- Returns rows tagged with their *physical start line number* in the CSV file (1-based)
|
||||
- Skips completely empty rows (but preserves numbering by using physical line numbers)
|
||||
- Handles `\\r\\n`, `\\n`, `\\r`
|
||||
- Correct even when fields contain newlines inside quotes: the row gets the start line number
|
||||
"""
|
||||
|
||||
@utf8_bom <<0xEF, 0xBB, 0xBF>>
|
||||
@quote ?"
|
||||
|
||||
@type line_number :: pos_integer()
|
||||
@type row :: [String.t()]
|
||||
@type numbered_row :: {line_number(), row()}
|
||||
|
||||
@doc """
Parses raw CSV file content into a header row and line-numbered data rows.

A leading UTF-8 BOM is stripped and all line endings are normalized to `\\n`
before any validation or parsing happens.

Returns `{:ok, headers, rows}`, where `rows` is a list of
`{line_number, fields}` tuples, or `{:error, reason}` with a human-readable
message. Any non-binary argument yields `{:error, "Invalid CSV content"}`.
"""
@spec parse(binary()) :: {:ok, row(), [numbered_row()]} | {:error, String.t()}
def parse(file_content) when is_binary(file_content) do
  content =
    file_content
    |> strip_bom()
    |> normalize_line_endings()

  # Every step returns either its success value or `{:error, reason}`, so a
  # failing step falls through `with` unchanged and no `else` is required.
  with :ok <- validate_content_not_empty(content),
       :ok <- check_balanced_quotes(content),
       {:ok, header_record, data_records} <- extract_header_and_data(content),
       :ok <- validate_header_not_empty(header_record) do
    parse_csv_records(header_record, data_records)
  end
end

def parse(_), do: {:error, "Invalid CSV content"}
|
||||
|
||||
# Rejects content that is empty or made up of whitespace only.
@spec validate_content_not_empty(binary()) :: :ok | {:error, String.t()}
defp validate_content_not_empty(content) do
  case String.trim(content) do
    "" -> {:error, "CSV file is empty"}
    _non_blank -> :ok
  end
end
|
||||
|
||||
# Splits the content into line-numbered records and separates the first record
# (the header) from the remaining data records. The header's line number is dropped.
@spec extract_header_and_data(binary()) :: {:ok, binary(), [{line_number(), binary()}]} | {:error, String.t()}
defp extract_header_and_data(content) do
  case split_records_with_line_numbers(content) do
    [{_header_line, header} | remaining] -> {:ok, header, remaining}
    [] -> {:error, "CSV file is empty"}
  end
end
|
||||
|
||||
# Rejects a header record that is empty or made up of whitespace only.
@spec validate_header_not_empty(binary()) :: :ok | {:error, String.t()}
defp validate_header_not_empty(header_record) do
  case String.trim(header_record) do
    "" -> {:error, "CSV file has no header row"}
    _non_blank -> :ok
  end
end
|
||||
|
||||
# Picks the parser from the delimiter detected in the header, then parses the
# header record and all data records with that same parser.
@spec parse_csv_records(binary(), [{line_number(), binary()}]) ::
        {:ok, row(), [numbered_row()]} | {:error, String.t()}
defp parse_csv_records(header_record, data_records) do
  parser =
    header_record
    |> detect_delimiter_by_parsing()
    |> get_parser()

  header_result = parse_single_record(parser, header_record, "CSV file has no header row")

  # Data records are only parsed once the header itself parsed successfully.
  with {:ok, headers} <- header_result,
       {:ok, numbered_rows} <- parse_data_records(parser, data_records) do
    {:ok, headers, numbered_rows}
  end
end
|
||||
|
||||
# Drops a leading UTF-8 byte order mark when present; other content is returned unchanged.
@spec strip_bom(binary()) :: binary()
defp strip_bom(content) do
  case content do
    <<@utf8_bom, without_bom::binary>> -> without_bom
    _ -> content
  end
end
|
||||
|
||||
# Rewrites every `\r\n` pair and every lone `\r` to a single `\n`.
@spec normalize_line_endings(binary()) :: binary()
defp normalize_line_endings(content) do
  # `\r\n?` greedily consumes the `\n` of a CRLF pair, so the pair collapses to
  # one `\n`, while a lone `\r` is replaced on its own.
  String.replace(content, ~r/\r\n?/, "\n")
end
|
||||
|
||||
# Maps a delimiter to its NimbleCSV parser module. Only "," selects the comma
# parser; ";" and any other value select the semicolon parser.
@spec get_parser(String.t()) :: module()
defp get_parser(delimiter) do
  case delimiter do
    "," -> Mv.Membership.Import.CsvParserComma
    _ -> Mv.Membership.Import.CsvParserSemicolon
  end
end
|
||||
|
||||
# --- Delimiter detection (quote-aware by actually parsing the header) ---
|
||||
|
||||
# Parses the header with both candidate parsers and returns the delimiter that
# produces more non-empty header fields.
@spec detect_delimiter_by_parsing(binary()) :: String.t()
defp detect_delimiter_by_parsing(header_record) do
  score = fn parser -> header_field_count(parser, header_record) end

  comma_fields = score.(Mv.Membership.Import.CsvParserComma)
  semicolon_fields = score.(Mv.Membership.Import.CsvParserSemicolon)

  # "," wins only with strictly more fields; a tie resolves to ";".
  if comma_fields > semicolon_fields, do: ",", else: ";"
end
|
||||
|
||||
# Counts the non-blank fields `parser` finds in the header record.
# A header that fails to parse scores 0.
@spec header_field_count(module(), binary()) :: non_neg_integer()
defp header_field_count(parser, header_record) do
  case parse_single_record(parser, header_record, nil) do
    {:error, _reason} ->
      0

    {:ok, fields} ->
      fields
      |> Enum.reject(&(String.trim(&1) == ""))
      |> length()
  end
end
|
||||
|
||||
# Runs `parser` over a single record and expects exactly one non-empty row back.
# A trailing newline is appended to the record first when it is missing.
#
# Returns `{:ok, row}` on success. Otherwise returns `{:error, reason}`, where
# the reason is `error_reason_if_empty` when that is a binary, a generic header
# message when it is not, or the exception message when the parser raises.
@spec parse_single_record(module(), binary(), String.t() | nil) ::
        {:ok, row()} | {:error, String.t()}
defp parse_single_record(parser, record, error_reason_if_empty) do
  record
  |> ensure_trailing_newline()
  |> parser.parse_string(skip_headers: false)
  |> case do
    [[_ | _] = fields] ->
      {:ok, fields}

    _other when is_binary(error_reason_if_empty) ->
      {:error, error_reason_if_empty}

    _other ->
      {:error, "Failed to parse CSV header"}
  end
rescue
  exception ->
    {:error, "Failed to parse CSV: #{Exception.message(exception)}"}
end
|
||||
|
||||
# Appends "\n" to the string unless it already ends with one.
@spec ensure_trailing_newline(binary()) :: binary()
defp ensure_trailing_newline(str) do
  case String.ends_with?(str, "\n") do
    true -> str
    false -> str <> "\n"
  end
end
|
||||
|
||||
# --- Data parsing preserving *physical* line numbers ---
|
||||
|
||||
# Parses each line-numbered record with `parser`, keeping the record's line
# number next to its parsed fields and preserving input order.
#
# Blank records, rows whose fields are all blank, and records that do not parse
# to exactly one row are left out of the result. If the parser raises, the
# whole call returns `{:error, reason}` carrying the exception message.
@spec parse_data_records(module(), [{line_number(), binary()}]) ::
        {:ok, [numbered_row()]} | {:error, String.t()}
defp parse_data_records(parser, data_records) do
  numbered_rows =
    Enum.flat_map(data_records, fn {line_no, record} ->
      if String.trim(record) == "" do
        []
      else
        case parser.parse_string(ensure_trailing_newline(record), skip_headers: false) do
          [fields] when is_list(fields) ->
            if empty_row?(fields), do: [], else: [{line_no, fields}]

          # Anything other than exactly one row is dropped like a blank line.
          _other ->
            []
        end
      end
    end)

  {:ok, numbered_rows}
rescue
  exception ->
    {:error, "Failed to parse CSV: #{Exception.message(exception)}"}
end
|
||||
|
||||
# True when no field in the row has any non-whitespace content
# (an empty list of fields counts as empty).
@spec empty_row?(row()) :: boolean()
defp empty_row?(row) when is_list(row) do
  not Enum.any?(row, &(String.trim(&1) != ""))
end
|
||||
|
||||
# Checks that the content contains an even number of quote characters.
#
# This is a plain count: an opening quote, a closing quote and an escaped quote
# (`""`) all contribute in pairs, so an odd total means a quote was left open.
# Returns `:ok` for an even count, `{:error, reason}` otherwise.
@spec check_balanced_quotes(binary()) :: :ok | {:error, String.t()}
defp check_balanced_quotes(content) do
  # `:binary.matches/2` allocates one entry per quote found, instead of
  # expanding the entire file into a list of bytes just to count them.
  quote_count = content |> :binary.matches(<<@quote>>) |> length()

  case rem(quote_count, 2) do
    0 -> :ok
    _ -> {:error, "Unbalanced quotes in CSV file"}
  end
end
|
||||
|
||||
# --- Quote-aware record splitting with physical line numbers ---
#
# Cuts the content into records at every newline that lies outside quotes and
# returns [{start_line_number, record_without_newline}, ...] in file order.
#
# Line numbers are 1-based physical lines. Empty lines produce empty records
# here, so numbering stays intact; they are filtered out later.
#
@spec split_records_with_line_numbers(binary()) :: [{line_number(), binary()}]
defp split_records_with_line_numbers(content) do
  {finished, pending, _in_quotes, _line, pending_start} =
    do_split(content, [], [], false, 1, 1)

  # Bytes still buffered at EOF form a last record that had no terminating
  # newline; an empty buffer adds nothing.
  finished =
    if pending == [] do
      finished
    else
      last_record = pending |> Enum.reverse() |> IO.iodata_to_binary()
      [{pending_start, last_record} | finished]
    end

  Enum.reverse(finished)
end
|
||||
|
||||
# Byte-by-byte scanner behind `split_records_with_line_numbers/1`.
#
# Arguments after the remaining input: completed records (newest first), the
# current record's bytes (reversed), whether the scanner is inside a quoted
# field, the current physical line, and the line the current record started on.
@spec do_split(
        binary(),
        [{line_number(), binary()}],
        [byte()],
        boolean(),
        line_number(),
        line_number()
      ) ::
        {[{line_number(), binary()}], [byte()], boolean(), line_number(), line_number()}
# Input exhausted: hand the final state back to the caller.
defp do_split(<<>>, done, pending, quoted, line, first_line) do
  {done, pending, quoted, line, first_line}
end

# Inside a quoted field a doubled quote is an escaped quote: both bytes are
# kept and the quoted state stays on.
defp do_split(<<@quote, @quote, tail::binary>>, done, pending, true, line, first_line) do
  do_split(tail, done, [@quote, @quote | pending], true, line, first_line)
end

# Any other quote opens or closes a quoted field.
defp do_split(<<@quote, tail::binary>>, done, pending, quoted, line, first_line) do
  do_split(tail, done, [@quote | pending], not quoted, line, first_line)
end

# A newline always advances the physical line counter. Inside quotes it is kept
# as record content; outside quotes it closes the current record (even an empty
# one) and the next record starts on the following line.
defp do_split(<<?\n, tail::binary>>, done, pending, quoted, line, first_line) do
  next_line = line + 1

  if quoted do
    do_split(tail, done, [?\n | pending], true, next_line, first_line)
  else
    record = pending |> Enum.reverse() |> :erlang.list_to_binary()
    do_split(tail, [{first_line, record} | done], [], false, next_line, next_line)
  end
end

# Every other byte is appended to the current record.
defp do_split(<<byte, tail::binary>>, done, pending, quoted, line, first_line) do
  do_split(tail, done, [byte | pending], quoted, line, first_line)
end
|
||||
end
|
||||
Loading…
Add table
Add a link
Reference in a new issue