defmodule Mv.Membership.Import.HeaderMapper do
  @moduledoc """
  Maps CSV headers to canonical member fields and custom fields.

  Provides header normalization and mapping functionality for CSV imports.
  Handles bilingual header variants (English/German) and custom field detection.

  ## Header Normalization

  `normalize_header/1` applies these steps, in this order:

    1. Unicode NFC composition (so decomposed umlauts are recognised)
    2. Trim surrounding whitespace
    3. Convert to lowercase
    4. Transliterate German characters (ß → ss, ä → ae, ö → oe, ü → ue)
    5. Unify hyphen variants (en dash, em dash, minus sign, … → `-`)
    6. Turn brackets and slashes into spaces
    7. Remove all whitespace, including non-breaking spaces
       (ensures "first name" == "firstname")

  Underscores and hyphens are kept, which is why variants such as
  `"e_mail"` and `"e-mail"` are listed explicitly.

  ## Member Field Mapping

  Maps CSV headers to canonical member fields:

    - `email` (required)
    - `first_name` (optional)
    - `last_name` (optional)
    - `street` (optional)
    - `postal_code` (optional)
    - `city` (optional)

  Supports both English and German variants
  (e.g., "Email" / "E-Mail", "First Name" / "Vorname").

  ## Custom Field Detection

  Custom fields are detected by matching normalized header names to normalized
  custom field names. Member fields have priority over custom fields (the
  member field wins in case of a collision), because only headers that did not
  match a member field are considered for custom fields.

  ## Examples

      iex> Mv.Membership.Import.HeaderMapper.normalize_header("  E-Mail  ")
      "e-mail"

      iex> Mv.Membership.Import.HeaderMapper.build_maps(["Email", "First Name"], [])
      {:ok, %{member: %{email: 0, first_name: 1}, custom: %{}, unknown: []}}

      iex> Mv.Membership.Import.HeaderMapper.build_maps(
      ...>   ["Email", "CustomField"],
      ...>   [%{id: "cf1", name: "CustomField"}]
      ...> )
      {:ok, %{member: %{email: 0}, custom: %{"cf1" => 1}, unknown: []}}
  """

  @type column_map :: %{atom() => non_neg_integer()}
  @type custom_field_map :: %{String.t() => non_neg_integer()}
  @type unknown_headers :: [String.t()]

  # Member fields that must be present in the header row.
  @required_member_fields [:email]

  # Canonical member fields with their raw (human-readable) variants.
  # The raw spelling is kept because it is shown in the "accepted: ..." part of
  # the missing-header error; the variants are normalized when the lookup is built.
  @member_field_variants_raw %{
    email: [
      "email",
      "e-mail",
      "e_mail",
      "e mail",
      "e-mail adresse",
      "e-mail-adresse",
      "mail"
    ],
    first_name: [
      "first name",
      "firstname",
      "first_name",
      "vorname"
    ],
    last_name: [
      "last name",
      "lastname",
      "last_name",
      "surname",
      "nachname",
      "familienname"
    ],
    street: [
      "street",
      "address",
      "strasse"
    ],
    postal_code: [
      "postal code",
      "postal_code",
      "zip",
      "postcode",
      "plz",
      "postleitzahl"
    ],
    city: [
      "city",
      "town",
      "stadt",
      "ort"
    ]
  }

  # Only lowercase forms are needed: transliteration runs after String.downcase/1.
  @transliterations %{"ß" => "ss", "ä" => "ae", "ö" => "oe", "ü" => "ue"}
  @transliterable Map.keys(@transliterations)

  # U+2010 hyphen, U+2011 non-breaking hyphen, U+2012 figure dash, U+2013 en dash,
  # U+2014 em dash, U+2015 horizontal bar, U+2212 minus sign.
  @hyphen_variants ["\u2010", "\u2011", "\u2012", "\u2013", "\u2014", "\u2015", "\u2212"]

  # Brackets and slashes are treated as word separators.
  @separator_punctuation ["(", ")", "[", "]", "{", "}", "/", "\\"]

  @doc """
  Normalizes a CSV header string for comparison.

  See the module documentation for the exact sequence of transformations.
  Any non-binary input (e.g. `nil`) normalizes to the empty string.

  ## Examples

      iex> Mv.Membership.Import.HeaderMapper.normalize_header("  E-Mail  ")
      "e-mail"

      iex> Mv.Membership.Import.HeaderMapper.normalize_header("Straße")
      "strasse"

      iex> Mv.Membership.Import.HeaderMapper.normalize_header("E-Mail (privat)")
      "e-mailprivat"

      iex> Mv.Membership.Import.HeaderMapper.normalize_header("First Name")
      "firstname"

  """
  @spec normalize_header(term()) :: String.t()
  def normalize_header(header) when is_binary(header) do
    header
    # Compose first so that "u" + combining diaeresis becomes "ü" and is transliterated.
    |> String.normalize(:nfc)
    |> String.trim()
    |> String.downcase()
    |> transliterate_unicode()
    |> unify_hyphens()
    |> normalize_punctuation()
    |> remove_whitespace()
  end

  def normalize_header(_), do: ""

  @doc """
  Builds column maps for member fields and custom fields from CSV headers.

  ## Parameters

    - `headers` - List of CSV header strings (in column order, 0-based indices)
    - `custom_fields` - List of maps/structs with `:id` and `:name` (atom or
      string keys); entries missing either value are ignored

  ## Returns

    - `{:ok, %{member: column_map, custom: custom_field_map, unknown: unknown_headers}}`
      on success. `unknown` holds the original (un-normalized) header strings in
      column order. Headers that normalize to the empty string are dropped
      entirely and are not reported as unknown.
    - `{:error, reason}` when a required member field is missing or when two
      headers map to the same member field or the same custom field

  ## Examples

      iex> Mv.Membership.Import.HeaderMapper.build_maps(["Email", "First Name"], [])
      {:ok, %{member: %{email: 0, first_name: 1}, custom: %{}, unknown: []}}

      iex> Mv.Membership.Import.HeaderMapper.build_maps(
      ...>   ["Email", "CustomField"],
      ...>   [%{id: "cf1", name: "CustomField"}]
      ...> )
      {:ok, %{member: %{email: 0}, custom: %{"cf1" => 1}, unknown: []}}

  """
  @spec build_maps([String.t()], [map()]) ::
          {:ok, %{member: column_map(), custom: custom_field_map(), unknown: unknown_headers()}}
          | {:error, String.t()}
  def build_maps(headers, custom_fields) when is_list(headers) and is_list(custom_fields) do
    indexed_headers = Enum.with_index(headers)

    with {:ok, member_map, unmatched} <- build_member_map(indexed_headers),
         {:ok, custom_map, unknown} <- build_custom_field_map(unmatched, custom_fields) do
      {:ok, %{member: member_map, custom: custom_map, unknown: unknown}}
    end
  end

  # --- Private Functions ---

  # Builds the reverse lookup: normalized variant -> canonical field.
  # It cannot be precomputed in a module attribute because it depends on
  # normalize_header/1 from this module, so callers build it once per
  # build_maps/2 run and pass it down.
  defp normalized_to_canonical do
    for {canonical, variants} <- @member_field_variants_raw,
        variant <- variants,
        into: %{} do
      {normalize_header(variant), canonical}
    end
  end

  # Transliterates German umlauts and sharp s (input is already lowercase).
  defp transliterate_unicode(str) do
    String.replace(str, @transliterable, &Map.fetch!(@transliterations, &1))
  end

  # Unifies typographic hyphen/dash variants to the ASCII hyphen-minus.
  defp unify_hyphens(str), do: String.replace(str, @hyphen_variants, "-")

  # Brackets and slashes become spaces (which are removed in the next step).
  defp normalize_punctuation(str), do: String.replace(str, @separator_punctuation, " ")

  # Removes every whitespace run. The `u` modifier makes `\s` Unicode-aware so
  # that inner non-breaking spaces are removed as well.
  defp remove_whitespace(str), do: String.replace(str, ~r/\s+/u, "")

  # Maps member-field headers to their column index.
  # Returns {:ok, member_map, unmatched} where `unmatched` is the list of
  # {header, index} pairs (in column order) that are not member fields.
  defp build_member_map(indexed_headers) do
    lookup = normalized_to_canonical()

    indexed_headers
    |> Enum.reduce_while({%{}, []}, fn {header, index}, {member_map, unmatched} ->
      case classify_member_header(normalize_header(header), lookup, member_map) do
        :skip -> {:cont, {member_map, unmatched}}
        :unknown -> {:cont, {member_map, [{header, index} | unmatched]}}
        {:ok, canonical} -> {:cont, {Map.put(member_map, canonical, index), unmatched}}
        {:error, _reason} = error -> {:halt, error}
      end
    end)
    |> case do
      {:error, _reason} = error -> error
      {member_map, unmatched} -> validate_required_fields(member_map, Enum.reverse(unmatched))
    end
  end

  # Decides what a single normalized header means for member mapping.
  # Empty headers are skipped silently; they are neither mapped nor unknown.
  defp classify_member_header("", _lookup, _member_map), do: :skip

  defp classify_member_header(normalized, lookup, member_map) do
    case Map.fetch(lookup, normalized) do
      :error ->
        :unknown

      {:ok, canonical} ->
        if Map.has_key?(member_map, canonical) do
          {:error, "duplicate header for #{canonical} (normalized: #{normalized})"}
        else
          {:ok, canonical}
        end
    end
  end

  # Ensures every required member field got a column; reports the first missing one.
  defp validate_required_fields(member_map, unmatched) do
    case Enum.reject(@required_member_fields, &Map.has_key?(member_map, &1)) do
      [] ->
        {:ok, member_map, unmatched}

      [missing_field | _] ->
        accepted =
          @member_field_variants_raw
          |> Map.get(missing_field, [])
          |> Enum.join(", ")

        {:error, "Missing required header: #{missing_field} (accepted: #{accepted})"}
    end
  end

  # Maps the headers that were not member fields to custom field ids.
  # Returns {:ok, custom_map, unknown_headers}. No empty-header check is needed
  # here: build_member_map/1 never passes on headers that normalize to "".
  defp build_custom_field_map(unmatched, custom_fields) do
    lookup = build_custom_field_lookup(custom_fields)

    unmatched
    |> Enum.reduce_while({%{}, []}, fn {header, index}, {custom_map, unknown} ->
      normalized = normalize_header(header)

      case Map.fetch(lookup, normalized) do
        :error ->
          {:cont, {custom_map, [header | unknown]}}

        {:ok, custom_field_id} ->
          if Map.has_key?(custom_map, custom_field_id) do
            {:halt, {:error, "duplicate custom field header (normalized: #{normalized})"}}
          else
            {:cont, {Map.put(custom_map, custom_field_id, index), unknown}}
          end
      end
    end)
    |> case do
      {:error, _reason} = error -> error
      {custom_map, unknown} -> {:ok, custom_map, Enum.reverse(unknown)}
    end
  end

  # Builds the lookup: normalized custom field name -> id.
  # Accepts atom or string keys; entries without a name or id are ignored.
  # If two names normalize to the same value, the later entry wins.
  defp build_custom_field_lookup(custom_fields) do
    for custom_field <- custom_fields,
        name = Map.get(custom_field, :name) || Map.get(custom_field, "name"),
        id = Map.get(custom_field, :id) || Map.get(custom_field, "id"),
        into: %{} do
      {normalize_header(name), id}
    end
  end
end
@doc """
Prepares CSV content for import by parsing, mapping headers, and validating limits.

## Options

  - `:max_rows` - maximum number of data rows accepted (default: `1000`)
  - `:chunk_size` - number of rows per chunk, must be a positive integer
    (default: `200`)

## Returns

  - `{:ok, state}` where `state` contains:
    - `:chunks` - list of chunks; each chunk is a list of
      `{csv_line_number, %{member: map, custom: map}}` tuples
    - `:column_map` - canonical member field => column index
    - `:custom_field_map` - custom field id (string) => column index
    - `:warnings` - one message per column that matched neither a member
      field nor an existing custom field
  - `{:error, reason}` when parsing fails, custom fields cannot be loaded, the
    headers are invalid, the row limit is exceeded, or `:chunk_size` is invalid
"""
@spec prepare(String.t(), keyword()) :: {:ok, import_state()} | {:error, String.t()}
def prepare(file_content, opts \\ []) do
  max_rows = Keyword.get(opts, :max_rows, 1000)
  chunk_size = Keyword.get(opts, :chunk_size, 200)

  # Module names are fully qualified so this function does not depend on
  # aliases declared elsewhere in the file.
  with :ok <- validate_chunk_size(chunk_size),
       {:ok, headers, rows} <- Mv.Membership.Import.CsvParser.parse(file_content),
       {:ok, custom_fields} <- load_custom_fields(),
       {:ok, maps, warnings} <- build_header_maps(headers, custom_fields),
       :ok <- validate_row_count(rows, max_rows) do
    {:ok,
     %{
       chunks: chunk_rows(rows, maps, chunk_size),
       column_map: maps.member,
       custom_field_map: maps.custom,
       warnings: warnings
     }}
  end
end

# Enum.chunk_every/2 raises for a non-positive count; turn that into a tagged
# error so prepare/2 keeps its {:ok, _} | {:error, _} contract.
defp validate_chunk_size(chunk_size) when is_integer(chunk_size) and chunk_size > 0, do: :ok

defp validate_chunk_size(chunk_size) do
  {:error, "Invalid chunk_size: #{inspect(chunk_size)} (must be a positive integer)"}
end

# Loads all custom fields from the database.
# Any exception raised by the read is converted into an error tuple.
defp load_custom_fields do
  {:ok, Ash.read!(Mv.Membership.CustomField)}
rescue
  e ->
    {:error, "Failed to load custom fields: #{Exception.message(e)}"}
end

# Builds the member/custom column maps via HeaderMapper and one warning per
# unknown column.
#
# No extra filtering of `unknown` is needed: HeaderMapper.build_maps/2 only
# reports headers that are non-empty after normalization and that matched
# neither a member field nor a custom field.
defp build_header_maps(headers, custom_fields) do
  custom_field_maps =
    Enum.map(custom_fields, fn custom_field ->
      %{id: to_string(custom_field.id), name: custom_field.name}
    end)

  with {:ok, %{member: member_map, custom: custom_map, unknown: unknown}} <-
         Mv.Membership.Import.HeaderMapper.build_maps(headers, custom_field_maps) do
    warnings =
      Enum.map(unknown, fn header ->
        "Unknown column '#{header}' will be ignored. " <>
          "If this is a custom field, create it in Mila before importing."
      end)

    {:ok, %{member: member_map, custom: custom_map}, warnings}
  end
end

# Validates that the row count doesn't exceed the limit.
defp validate_row_count(rows, max_rows) do
  if length(rows) > max_rows do
    {:error, "CSV file exceeds maximum row limit of #{max_rows} rows"}
  else
    :ok
  end
end

# Converts every {line_number, row_values} tuple into {line_number, row_map}
# and splits the result into chunks of `chunk_size` rows.
defp chunk_rows(rows, maps, chunk_size) do
  rows
  |> Enum.map(fn {line_number, row_values} ->
    {line_number, build_row_map(row_values, maps)}
  end)
  |> Enum.chunk_every(chunk_size)
end

# Builds %{member: ..., custom: ...} from raw row values using the column maps.
# Columns missing from a short row default to "".
defp build_row_map(row_values, maps) do
  pick = fn {key, index} -> {key, Enum.at(row_values, index, "")} end

  %{member: Map.new(maps.member, pick), custom: Map.new(maps.custom, pick)}
end
@doc """
Processes a chunk of CSV rows and creates one member per row.

## Parameters

  - `chunk_rows_with_lines` - List of tuples `{csv_line_number, row_map}` where:
    - `csv_line_number` - Physical line number in CSV (1-based)
    - `row_map` - Map with `:member` and `:custom` keys containing field values
  - `column_map` - Map of canonical field names (atoms) to column indices
    (for reference; not used by the current implementation)
  - `custom_field_map` - Map of custom field IDs (strings) to column indices
    (for reference; not used by the current implementation)
  - `opts` - Optional keyword list for processing options (currently unused)

## Returns

  - `{:ok, %{inserted: count, failed: count, errors: errors}}` where `errors`
    holds one `Error` struct per failed row, in row order. A failing row never
    aborts the chunk.

## Examples

    iex> chunk = [{2, %{member: %{email: "john@example.com"}, custom: %{}}}]
    iex> column_map = %{email: 0}
    iex> custom_field_map = %{}
    iex> MemberCSV.process_chunk(chunk, column_map, custom_field_map)
    {:ok, %{inserted: 1, failed: 0, errors: []}}
"""
@spec process_chunk(
        list({pos_integer(), map()}),
        %{atom() => non_neg_integer()},
        %{String.t() => non_neg_integer()},
        keyword()
      ) :: {:ok, chunk_result()} | {:error, String.t()}
def process_chunk(chunk_rows_with_lines, _column_map, _custom_field_map, _opts \\ []) do
  # Load the custom field definitions once per chunk instead of once per value.
  custom_fields_by_id = preload_custom_fields(chunk_rows_with_lines)

  {inserted, failed, errors} =
    Enum.reduce(chunk_rows_with_lines, {0, 0, []}, fn {line_number, row_map},
                                                      {inserted, failed, errors} ->
      case process_row(row_map, line_number, custom_fields_by_id) do
        {:ok, _member} -> {inserted + 1, failed, errors}
        {:error, error} -> {inserted, failed + 1, [error | errors]}
      end
    end)

  {:ok, %{inserted: inserted, failed: failed, errors: Enum.reverse(errors)}}
end

# Returns %{custom_field_id_string => custom_field} for all custom fields, or
# an empty map when no row in the chunk carries custom values (no query then).
# Best effort: if the read raises, custom values are skipped, matching the
# "skip if custom field not found" policy of prepare_custom_field_values/2.
defp preload_custom_fields(chunk_rows_with_lines) do
  if Enum.any?(chunk_rows_with_lines, &row_has_custom_values?/1) do
    Mv.Membership.CustomField
    |> Ash.read!()
    |> Map.new(fn custom_field -> {to_string(custom_field.id), custom_field} end)
  else
    %{}
  end
rescue
  _ -> %{}
end

defp row_has_custom_values?({_line_number, %{custom: custom}}) when is_map(custom),
  do: map_size(custom) > 0

defp row_has_custom_values?(_), do: false

# Processes a single row: creates the member, including custom field values
# when there are any. Every failure (validation error or raised exception) is
# returned as {:error, %Error{}} tagged with the CSV line number.
defp process_row(%{member: member_attrs, custom: custom_attrs}, line_number, custom_fields_by_id) do
  trimmed_attrs = trim_string_values(member_attrs)

  # Only include :custom_field_values when there is something to store.
  attrs =
    case prepare_custom_field_values(custom_attrs, custom_fields_by_id) do
      [] -> trimmed_attrs
      values -> Map.put(trimmed_attrs, :custom_field_values, values)
    end

  case Mv.Membership.create_member(attrs) do
    {:ok, member} ->
      {:ok, member}

    {:error, %Ash.Error.Invalid{} = error} ->
      {:error, format_ash_error(error, line_number)}

    {:error, error} ->
      {:error, %Error{csv_line_number: line_number, field: nil, message: inspect(error)}}
  end
rescue
  e ->
    {:error, %Error{csv_line_number: line_number, field: nil, message: Exception.message(e)}}
end

# Converts the row's custom values into the list of maps passed to
# create_member. Empty values and values whose custom field is unknown are
# skipped.
defp prepare_custom_field_values(custom_attrs, custom_fields_by_id) when is_map(custom_attrs) do
  for {id, value} <- custom_attrs,
      value not in [nil, ""],
      {:ok, custom_field} <- [Map.fetch(custom_fields_by_id, normalize_custom_field_id(id))] do
    %{
      "custom_field_id" => to_string(custom_field.id),
      "value" => format_custom_field_value(value, custom_field.value_type)
    }
  end
end

defp prepare_custom_field_values(_custom_attrs, _custom_fields_by_id), do: []

# Canonicalizes UUID-like ids so they match the keys built by
# preload_custom_fields/1; anything else is used unchanged.
defp normalize_custom_field_id(id) when is_binary(id) do
  case Ecto.UUID.cast(id) do
    {:ok, uuid} -> uuid
    :error -> id
  end
end

defp normalize_custom_field_id(id), do: id

# Formats a custom field value according to its type, using the
# "_union_type" / "_union_value" map format.
# A value that cannot be cast to the field's type is passed on as a trimmed
# string instead of being silently coerced.
# NOTE(review): presumably the resource's validation then rejects the row -
# confirm against the custom field value type.
defp format_custom_field_value(value, type) when is_binary(value) do
  trimmed = String.trim(value)

  case cast_custom_field_value(type, trimmed) do
    {:ok, union_type, cast_value} ->
      %{"_union_type" => union_type, "_union_value" => cast_value}

    :error ->
      %{"_union_type" => "string", "_union_value" => trimmed}
  end
end

defp cast_custom_field_value(:string, value), do: {:ok, "string", value}
defp cast_custom_field_value(:email, value), do: {:ok, "email", value}

defp cast_custom_field_value(:integer, value) do
  # Require the whole string to be an integer: "42abc" and "12.5" are not.
  case Integer.parse(value) do
    {int_value, ""} -> {:ok, "integer", int_value}
    _ -> :error
  end
end

defp cast_custom_field_value(:boolean, value) do
  case String.downcase(value) do
    truthy when truthy in ["true", "1", "yes", "ja"] -> {:ok, "boolean", true}
    falsy when falsy in ["false", "0", "no", "nein"] -> {:ok, "boolean", false}
    _ -> :error
  end
end

defp cast_custom_field_value(:date, value) do
  case Date.from_iso8601(value) do
    {:ok, date} -> {:ok, "date", date}
    {:error, _} -> :error
  end
end

# Unknown types default to string.
defp cast_custom_field_value(_type, _value), do: :error

# Trims all string values in the member attributes; other values are kept as-is.
defp trim_string_values(attrs) do
  Map.new(attrs, fn
    {key, value} when is_binary(value) -> {key, String.trim(value)}
    pair -> pair
  end)
end
# Formats an Ash validation error into a single MemberCSV.Error struct.
#
# Only one error is reported per row: an email-related error if there is one
# (for better error messages), otherwise the first error in the list.
defp format_ash_error(%Ash.Error.Invalid{errors: errors}, line_number) do
  primary_error = Enum.find(errors, &match?(%{field: :email}, &1)) || List.first(errors)

  case primary_error do
    %{field: field, message: message} when is_atom(field) and not is_nil(message) ->
      %Error{
        csv_line_number: line_number,
        field: field,
        message: format_error_message(message, field)
      }

    %{message: message} when not is_nil(message) ->
      %Error{
        csv_line_number: line_number,
        field: nil,
        message: format_error_message(message, nil)
      }

    # Errors that carry no usable :message value still implement the Exception
    # behaviour, so ask the exception for its text instead of degrading to the
    # generic fallback below.
    # NOTE(review): presumably covers Ash errors such as "required" - confirm.
    error when is_exception(error) ->
      field = error_field(error)

      %Error{
        csv_line_number: line_number,
        field: field,
        message: format_error_message(Exception.message(error), field)
      }

    _ ->
      %Error{
        csv_line_number: line_number,
        field: nil,
        message: "Validation failed"
      }
  end
end

# Returns the error's :field when it is an atom, otherwise nil.
defp error_field(error) do
  case Map.get(error, :field) do
    field when is_atom(field) -> field
    _ -> nil
  end
end

# Formats error messages, handling common cases like email uniqueness.
defp format_error_message(message, field) when is_binary(message) do
  if email_uniqueness_error?(message, field) do
    "email has already been taken"
  else
    message
  end
end

defp format_error_message(message, _field), do: to_string(message)

# Checks whether a message on the :email field looks like a uniqueness
# violation, using a case-insensitive substring match.
defp email_uniqueness_error?(message, :email) do
  message
  |> String.downcase()
  |> String.contains?([
    "unique",
    "constraint",
    "duplicate",
    "already been taken",
    "already exists",
    "violates unique constraint"
  ])
end

defp email_uniqueness_error?(_message, _field), do: false