Implement CSV parser closes #331 #351
2 changed files with 51 additions and 34 deletions
|
|
@ -23,13 +23,9 @@ defmodule Mv.Membership.Import.CsvParser do
|
||||||
|
|
||||||
@spec parse(binary()) :: {:ok, row(), [numbered_row()]} | {:error, String.t()}
|
@spec parse(binary()) :: {:ok, row(), [numbered_row()]} | {:error, String.t()}
|
||||||
def parse(file_content) when is_binary(file_content) do
|
def parse(file_content) when is_binary(file_content) do
|
||||||
content =
|
with :ok <- validate_utf8(file_content),
|
||||||
file_content
|
content <- file_content |> strip_bom() |> normalize_line_endings(),
|
||||||
|> strip_bom()
|
:ok <- validate_content_not_empty(content),
|
||||||
|> normalize_line_endings()
|
|
||||||
|
|
||||||
with :ok <- validate_content_not_empty(content),
|
|
||||||
:ok <- check_balanced_quotes(content),
|
|
||||||
{:ok, header_record, data_records} <- extract_header_and_data(content),
|
{:ok, header_record, data_records} <- extract_header_and_data(content),
|
||||||
:ok <- validate_header_not_empty(header_record),
|
:ok <- validate_header_not_empty(header_record),
|
||||||
{:ok, headers, rows} <- parse_csv_records(header_record, data_records) do
|
{:ok, headers, rows} <- parse_csv_records(header_record, data_records) do
|
||||||
|
|
@ -41,6 +37,15 @@ defmodule Mv.Membership.Import.CsvParser do
|
||||||
|
|
||||||
def parse(_), do: {:error, "Invalid CSV content"}
|
def parse(_), do: {:error, "Invalid CSV content"}
|
||||||
|
|
||||||
|
@spec validate_utf8(binary()) :: :ok | {:error, String.t()}
|
||||||
|
defp validate_utf8(content) do
|
||||||
|
if String.valid?(content) do
|
||||||
|
:ok
|
||||||
|
else
|
||||||
|
{:error, "CSV must be valid UTF-8"}
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
@spec validate_content_not_empty(binary()) :: :ok | {:error, String.t()}
|
@spec validate_content_not_empty(binary()) :: :ok | {:error, String.t()}
|
||||||
defp validate_content_not_empty(content) do
|
defp validate_content_not_empty(content) do
|
||||||
if String.trim(content) == "" do
|
if String.trim(content) == "" do
|
||||||
|
|
@ -157,10 +162,10 @@ defmodule Mv.Membership.Import.CsvParser do
|
||||||
try do
|
try do
|
||||||
rows =
|
rows =
|
||||||
data_records
|
data_records
|
||||||
|> Enum.reduce([], fn {line_no, record}, acc ->
|
|> Enum.reduce_while([], fn {line_no, record}, acc ->
|
||||||
case String.trim(record) do
|
case String.trim(record) do
|
||||||
"" ->
|
"" ->
|
||||||
acc
|
{:cont, acc}
|
||||||
|
|
||||||
_ ->
|
_ ->
|
||||||
parsed = parser.parse_string(ensure_trailing_newline(record), skip_headers: false)
|
parsed = parser.parse_string(ensure_trailing_newline(record), skip_headers: false)
|
||||||
|
|
@ -168,20 +173,23 @@ defmodule Mv.Membership.Import.CsvParser do
|
||||||
case parsed do
|
case parsed do
|
||||||
[row] when is_list(row) ->
|
[row] when is_list(row) ->
|
||||||
if empty_row?(row) do
|
if empty_row?(row) do
|
||||||
acc
|
{:cont, acc}
|
||||||
else
|
else
|
||||||
[{line_no, row} | acc]
|
{:cont, [{line_no, row} | acc]}
|
||||||
end
|
end
|
||||||
|
|
||||||
# empty row or unparsable row -> treat as skipped empty row
|
# unparsable row -> return error with line number
|
||||||
_ ->
|
_ ->
|
||||||
acc
|
snippet = String.slice(record, 0, min(50, String.length(record)))
|
||||||
|
{:halt, {:error, "Failed to parse CSV data at line #{line_no}: #{inspect(snippet)}"}}
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end)
|
end)
|
||||||
|> Enum.reverse()
|
|
||||||
|
|
||||||
{:ok, rows}
|
case rows do
|
||||||
|
{:error, reason} -> {:error, reason}
|
||||||
|
rows -> {:ok, Enum.reverse(rows)}
|
||||||
|
end
|
||||||
rescue
|
rescue
|
||||||
e ->
|
e ->
|
||||||
{:error, "Failed to parse CSV: #{Exception.message(e)}"}
|
{:error, "Failed to parse CSV: #{Exception.message(e)}"}
|
||||||
|
|
@ -193,21 +201,6 @@ defmodule Mv.Membership.Import.CsvParser do
|
||||||
Enum.all?(row, fn field -> String.trim(field) == "" end)
|
Enum.all?(row, fn field -> String.trim(field) == "" end)
|
||||||
end
|
end
|
||||||
|
|
||||||
# Check if quotes are balanced in the content.
|
|
||||||
#
|
|
||||||
# This is a simple check that counts quote characters. In CSV, escaped quotes
|
|
||||||
# are represented as "", so an odd number of quotes indicates unbalanced quotes.
|
|
||||||
@spec check_balanced_quotes(binary()) :: :ok | {:error, String.t()}
|
|
||||||
defp check_balanced_quotes(content) do
|
|
||||||
quote_count = content |> :binary.bin_to_list() |> Enum.count(&(&1 == @quote))
|
|
||||||
|
|
||||||
if rem(quote_count, 2) != 0 do
|
|
||||||
{:error, "Unbalanced quotes in CSV file"}
|
|
||||||
else
|
|
||||||
:ok
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
# --- Record splitting with correct line numbers (quote-aware) ---
|
# --- Record splitting with correct line numbers (quote-aware) ---
|
||||||
#
|
#
|
||||||
# This splits the CSV into "records" separated by newline *outside quotes*,
|
# This splits the CSV into "records" separated by newline *outside quotes*,
|
||||||
|
|
|
||||||
|
|
@ -109,7 +109,7 @@ defmodule Mv.Membership.Import.CsvParserTest do
|
||||||
assert {:ok, headers, rows} = CsvParser.parse(csv_content)
|
assert {:ok, headers, rows} = CsvParser.parse(csv_content)
|
||||||
|
|
||||||
assert headers == ["email"]
|
assert headers == ["email"]
|
||||||
# Lines 2, 3, 4 are empty (skipped), line 4 has data
|
# Lines 2 & 3 are empty (skipped), line 4 has data
|
||||||
assert rows == [{4, ["john@example.com"]}]
|
assert rows == [{4, ["john@example.com"]}]
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
@ -156,6 +156,19 @@ defmodule Mv.Membership.Import.CsvParserTest do
|
||||||
assert headers == ["email", "name"]
|
assert headers == ["email", "name"]
|
||||||
assert rows == [{2, ["john@example.com", "John \"Johnny\" Doe"]}]
|
assert rows == [{2, ["john@example.com", "John \"Johnny\" Doe"]}]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
test "handles multiline quoted fields with correct line numbering" do
|
||||||
|
# Header line 1
|
||||||
|
# Data record starts line 2, contains "foo\nbar" in a field
|
||||||
|
# Record ends physically at line 3
|
||||||
|
# Expected: row gets line number 2 (start line)
|
||||||
|
csv_content = "email;description\njohn@example.com;\"foo\nbar\""
|
||||||
|
|
||||||
|
assert {:ok, headers, rows} = CsvParser.parse(csv_content)
|
||||||
|
|
||||||
|
assert headers == ["email", "description"]
|
||||||
|
assert rows == [{2, ["john@example.com", "foo\nbar"]}]
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
describe "error handling" do
|
describe "error handling" do
|
||||||
|
|
@ -170,12 +183,23 @@ defmodule Mv.Membership.Import.CsvParserTest do
|
||||||
assert reason =~ "CSV file is empty"
|
assert reason =~ "CSV file is empty"
|
||||||
end
|
end
|
||||||
|
|
||||||
test "returns {:error, reason} for invalid CSV format" do
|
test "returns {:error, reason} for invalid UTF-8 content" do
|
||||||
# Unbalanced quotes
|
# Invalid UTF-8 sequence
|
||||||
csv_content = "email;name\n\"john@example.com;John"
|
invalid_utf8 = <<0xFF, 0xFE, 0xFD>>
|
||||||
|
|
||||||
|
assert {:error, reason} = CsvParser.parse(invalid_utf8)
|
||||||
|
assert reason =~ "UTF-8"
|
||||||
|
end
|
||||||
|
|
||||||
|
test "returns {:error, reason} for unparsable data row" do
|
||||||
|
# Malformed CSV row that cannot be parsed
|
||||||
|
# NimbleCSV will throw an exception for unclosed quotes
|
||||||
|
csv_content = "email;name\njohn@example.com;\"unclosed quote"
|
||||||
|
|
||||||
assert {:error, reason} = CsvParser.parse(csv_content)
|
assert {:error, reason} = CsvParser.parse(csv_content)
|
||||||
assert is_binary(reason)
|
assert is_binary(reason)
|
||||||
|
# Error message should indicate parsing failure
|
||||||
|
assert reason =~ "parse" or reason =~ "CSV"
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue