/*
 * The MIT License
 *
 * Copyright 2016 feature[23]
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

using System;
using System.Linq;

// ReSharper disable SuggestVarOrType_Elsewhere
// ReSharper disable LoopCanBeConvertedToQuery

namespace StringMetric
{
    /// <summary>
    /// The Jaro-Winkler distance metric is designed and best suited for short
    /// strings such as person names, and to detect typos; it is (roughly) a
    /// variation of Damerau-Levenshtein, where the substitution of 2 close
    /// characters is considered less important than the substitution of 2 characters
    /// that are far from each other.
    /// Jaro-Winkler was developed in the area of record linkage (duplicate
    /// detection) (Winkler, 1990). It returns a value in the interval [0.0, 1.0].
    /// The distance is computed as 1 - Jaro-Winkler similarity.
    /// </summary>
    public class JaroWinkler
    {
        private const double DefaultThreshold = 0.7;

        /// <summary>Upper bound of the per-prefix-character scaling factor used in the Winkler bonus.</summary>
        private const double JwCoef = 0.1;

        /// <summary>
        /// The threshold used for adding the Winkler bonus: the bonus is applied only
        /// when the Jaro score is strictly greater than this value. The default value is 0.7.
        /// </summary>
        private double Threshold { get; }

        /// <summary>
        /// Creates a new instance with default threshold (0.7).
        /// </summary>
        public JaroWinkler()
        {
            Threshold = DefaultThreshold;
        }

        /// <summary>
        /// Creates a new instance with the given threshold, which determines when the
        /// Winkler bonus is applied. The bonus is added only when the Jaro score is
        /// strictly greater than the threshold, so a threshold of 1.0 or more yields
        /// the plain Jaro similarity, while a negative threshold applies the bonus
        /// to every pair that has at least one matching character.
        /// </summary>
        /// <param name="threshold">The Jaro score above which the Winkler bonus is added.</param>
        public JaroWinkler(double threshold)
        {
            Threshold = threshold;
        }

        /// <summary>
        /// Compute Jaro-Winkler similarity.
        /// </summary>
        /// <param name="s1">The first string to compare.</param>
        /// <param name="s2">The second string to compare.</param>
        /// <returns>The Jaro-Winkler similarity in the range [0, 1].</returns>
        /// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
        public double Similarity(string s1, string s2)
        {
            if (s1 == null)
            {
                throw new ArgumentNullException(nameof(s1));
            }

            if (s2 == null)
            {
                throw new ArgumentNullException(nameof(s2));
            }

            if (s1.Equals(s2))
            {
                return 1.0;
            }

            MatchStats stats = Matches(s1, s2);
            if (stats.MatchCount == 0)
            {
                return 0.0;
            }

            // Work in double throughout: doing this arithmetic in float would leave
            // roughly 1e-8 of rounding error in a result that is returned as a double.
            double m = stats.MatchCount;
            double jaro = (m / s1.Length + m / s2.Length + (m - stats.Transpositions) / m) / 3.0;

            if (jaro > Threshold)
            {
                // The per-character factor is capped at 1 / (longer length), so the
                // bonus can never push the result above 1 even for a long common prefix.
                double scaling = Math.Min(JwCoef, 1.0 / stats.LongerLength);
                return jaro + scaling * stats.PrefixLength * (1 - jaro);
            }

            return jaro;
        }

        /// <summary>
        /// Return 1 - similarity.
        /// </summary>
        /// <param name="s1">The first string to compare.</param>
        /// <param name="s2">The second string to compare.</param>
        /// <returns>1 - similarity.</returns>
        /// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
        public double Distance(string s1, string s2) => 1.0 - Similarity(s1, s2);

        /// <summary>
        /// Collects the raw counts needed by the Jaro-Winkler formula: the number of
        /// matching characters, the number of transpositions (half the count of matched
        /// characters that appear in a different order), the length of the common
        /// prefix, and the length of the longer input.
        /// </summary>
        private static MatchStats Matches(string s1, string s2)
        {
            string longer, shorter;
            if (s1.Length > s2.Length)
            {
                longer = s1;
                shorter = s2;
            }
            else
            {
                longer = s2;
                shorter = s1;
            }

            // Characters only count as matching when their positions differ by at most this much.
            int range = Math.Max(longer.Length / 2 - 1, 0);

            bool[] shorterMatched = new bool[shorter.Length];
            bool[] longerMatched = new bool[longer.Length];
            int matchCount = 0;

            for (int si = 0; si < shorter.Length; si++)
            {
                char c = shorter[si];
                int windowEnd = Math.Min(si + range + 1, longer.Length);
                for (int li = Math.Max(si - range, 0); li < windowEnd; li++)
                {
                    // Each character of the longer string may be claimed by one match only.
                    if (!longerMatched[li] && c == longer[li])
                    {
                        shorterMatched[si] = true;
                        longerMatched[li] = true;
                        matchCount++;
                        break;
                    }
                }
            }

            // Walk the matched characters of both strings in order and count the
            // positions at which they disagree. Both strings have exactly matchCount
            // flagged characters, so the inner cursor cannot run past the end.
            int outOfOrder = 0;
            int cursor = 0;
            for (int si = 0; si < shorter.Length; si++)
            {
                if (!shorterMatched[si])
                {
                    continue;
                }

                while (!longerMatched[cursor])
                {
                    cursor++;
                }

                if (shorter[si] != longer[cursor])
                {
                    outOfOrder++;
                }

                cursor++;
            }

            int prefixLength = 0;
            while (prefixLength < shorter.Length && s1[prefixLength] == s2[prefixLength])
            {
                prefixLength++;
            }

            return new MatchStats(matchCount, outOfOrder / 2, prefixLength, longer.Length);
        }

        /// <summary>
        /// Raw counts produced by <see cref="Matches"/> and consumed by <see cref="Similarity"/>.
        /// </summary>
        private struct MatchStats
        {
            public readonly int MatchCount;
            public readonly int Transpositions;
            public readonly int PrefixLength;
            public readonly int LongerLength;

            public MatchStats(int matchCount, int transpositions, int prefixLength, int longerLength)
            {
                MatchCount = matchCount;
                Transpositions = transpositions;
                PrefixLength = prefixLength;
                LongerLength = longerLength;
            }
        }
    }
}