From 12f95c19984b87f78b5cdc44208e34a8a7b5905d Mon Sep 17 00:00:00 2001 From: Moritz Date: Thu, 11 Dec 2025 13:37:49 +0100 Subject: [PATCH] docs: document fuzzy search similarity threshold strategy Explain the two-tier matching approach: - % operator with server-wide threshold (0.3) for fast index scans - similarity functions with configurable threshold (0.2) for edge cases Add rationale for threshold value based on German name testing --- lib/membership/member.ex | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/lib/membership/member.ex b/lib/membership/member.ex index a558cf0..a5d28a5 100644 --- a/lib/membership/member.ex +++ b/lib/membership/member.ex @@ -42,6 +42,21 @@ defmodule Mv.Membership.Member do # Module constants @member_search_limit 10 + + # Similarity threshold for fuzzy name/address matching. + # Lower value = more results but less accurate (0.1-0.9) + # + # Fuzzy matching uses two complementary strategies: + # 1. % operator: Fast GIN-index-based matching using server-wide threshold (default 0.3) + # - Catches exact trigram matches quickly via index + # 2. similarity/word_similarity functions: Precise matching with this configurable threshold + # - Catches partial matches that % operator might miss + # + # Value 0.2 chosen based on testing with typical German names: + # - "Müller" vs "Mueller": similarity ~0.65 ✓ + # - "Schmidt" vs "Schmitt": similarity ~0.75 ✓ + # - "Wagner" vs "Wegner": similarity ~0.55 ✓ + # - Random unrelated names: similarity ~0.15 ✗ @default_similarity_threshold 0.2 # Use constants from Mv.Constants for member fields @@ -539,9 +554,16 @@ defmodule Mv.Membership.Member do ) end - # Builds fuzzy/trigram matching filter for name and street fields - # Uses pg_trgm extension with GIN indexes for performance - # Note: Requires trigram indexes on first_name, last_name, street + # Builds fuzzy/trigram matching filter for name and street fields. + # Uses pg_trgm extension with GIN indexes for performance. + # + # Two-tier matching strategy: + # - % operator: Uses server-wide pg_trgm.similarity_threshold (typically 0.3) + # for fast index-based initial filtering + # - similarity/word_similarity: Uses @default_similarity_threshold (0.2) + # for more lenient matching to catch edge cases + # + # Note: Requires trigram GIN indexes on first_name, last_name, street. defp build_fuzzy_filter(query, threshold) do expr( fragment("? % first_name", ^query) or