/*
* The MIT License
*
* Copyright 2016 feature[23]
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
using System;
using System.Linq;
// ReSharper disable SuggestVarOrType_Elsewhere
// ReSharper disable LoopCanBeConvertedToQuery
namespace StringMetric
{
/// The Jaro–Winkler distance metric is designed and best suited for short
/// strings such as person names, and to detect typos; it is (roughly) a
/// variation of Damerau-Levenshtein, where the substitution of 2 close
/// characters is considered less important then the substitution of 2 characters
/// that a far from each other.
/// Jaro-Winkler was developed in the area of record linkage (duplicate
/// detection) (Winkler, 1990). It returns a value in the interval [0.0, 1.0].
/// The distance is computed as 1 - Jaro-Winkler similarity.
public class JaroWinkler
{
private const double DEFAULT_THRESHOLD = 0.7;
private const int THREE = 3;
private const double JW_COEF = 0.1;
///
/// The current value of the threshold used for adding the Winkler bonus. The default value is 0.7.
///
private double Threshold { get; }
///
/// Creates a new instance with default threshold (0.7)
///
public JaroWinkler()
{
Threshold = DEFAULT_THRESHOLD;
}
///
/// Creates a new instance with given threshold to determine when Winkler bonus should
/// be used. Set threshold to a negative value to get the Jaro distance.
///
///
public JaroWinkler(double threshold)
{
Threshold = threshold;
}
///
/// Compute Jaro-Winkler similarity.
///
/// The first string to compare.
/// The second string to compare.
/// The Jaro-Winkler similarity in the range [0, 1]
/// If s1 or s2 is null.
public double Similarity(string s1, string s2)
{
if (s1 == null)
{
throw new ArgumentNullException(nameof(s1));
}
if (s2 == null)
{
throw new ArgumentNullException(nameof(s2));
}
if (s1.Equals(s2))
{
return 1f;
}
int[] mtp = Matches(s1, s2);
float m = mtp[0];
if (m == 0)
{
return 0f;
}
double j = ((m / s1.Length + m / s2.Length + (m - mtp[1]) / m))
/ THREE;
double jw = j;
if (j > Threshold)
{
jw = j + Math.Min(JW_COEF, 1.0 / mtp[THREE]) * mtp[2] * (1 - j);
}
return jw;
}
///
/// Return 1 - similarity.
///
/// The first string to compare.
/// The second string to compare.
/// 1 - similarity
/// If s1 or s2 is null.
public double Distance(string s1, string s2)
=> 1.0 - Similarity(s1, s2);
private static int[] Matches(string s1, string s2)
{
string max, min;
if (s1.Length > s2.Length)
{
max = s1;
min = s2;
}
else
{
max = s2;
min = s1;
}
int range = Math.Max(max.Length / 2 - 1, 0);
//int[] matchIndexes = new int[min.Length];
//Arrays.fill(matchIndexes, -1);
int[] match_indexes = Enumerable.Repeat(-1, min.Length).ToArray();
bool[] match_flags = new bool[max.Length];
int matches = 0;
for (int mi = 0; mi < min.Length; mi++)
{
char c1 = min[mi];
for (int xi = Math.Max(mi - range, 0),
xn = Math.Min(mi + range + 1, max.Length); xi < xn; xi++)
{
if (!match_flags[xi] && c1 == max[xi])
{
match_indexes[mi] = xi;
match_flags[xi] = true;
matches++;
break;
}
}
}
char[] ms1 = new char[matches];
char[] ms2 = new char[matches];
for (int i = 0, si = 0; i < min.Length; i++)
{
if (match_indexes[i] != -1)
{
ms1[si] = min[i];
si++;
}
}
for (int i = 0, si = 0; i < max.Length; i++)
{
if (match_flags[i])
{
ms2[si] = max[i];
si++;
}
}
int transpositions = 0;
for (int mi = 0; mi < ms1.Length; mi++)
{
if (ms1[mi] != ms2[mi])
{
transpositions++;
}
}
int prefix = 0;
for (int mi = 0; mi < min.Length; mi++)
{
if (s1[mi] == s2[mi])
{
prefix++;
}
else
{
break;
}
}
return new[] { matches, transpositions / 2, prefix, max.Length };
}
}
}