001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.text.similarity; 018 019import java.util.Arrays; 020 021/** 022 * An algorithm for measuring the difference between two character sequences. 023 * 024 * <p> 025 * This is the number of changes needed to change one sequence into another, 026 * where each change is a single character modification (deletion, insertion 027 * or substitution). 028 * </p> 029 * 030 * @since 1.0 031 */ 032public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResults> { 033 034 /** 035 * Default instance. 036 */ 037 private static final LevenshteinDetailedDistance DEFAULT_INSTANCE = new LevenshteinDetailedDistance(); 038 /** 039 * Threshold. 040 */ 041 private final Integer threshold; 042 043 /** 044 * <p> 045 * This returns the default instance that uses a version 046 * of the algorithm that does not use a threshold parameter. 047 * </p> 048 * 049 * @see LevenshteinDetailedDistance#getDefaultInstance() 050 */ 051 public LevenshteinDetailedDistance() { 052 this(null); 053 } 054 055 /** 056 * If the threshold is not null, distance calculations will be limited to a maximum length. 057 * 058 * <p>If the threshold is null, the unlimited version of the algorithm will be used.</p> 059 * 060 * @param threshold If this is null then distances calculations will not be limited. This may not be negative. 061 */ 062 public LevenshteinDetailedDistance(final Integer threshold) { 063 if (threshold != null && threshold < 0) { 064 throw new IllegalArgumentException("Threshold must not be negative"); 065 } 066 this.threshold = threshold; 067 } 068 069 /** 070 * <p>Find the Levenshtein distance between two Strings.</p> 071 * 072 * <p>A higher score indicates a greater distance.</p> 073 * 074 * <p>The previous implementation of the Levenshtein distance algorithm 075 * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> 076 * 077 * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError 078 * which can occur when my Java implementation is used with very large strings.<br> 079 * This implementation of the Levenshtein distance algorithm 080 * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> 081 * 082 * <pre> 083 * distance.apply(null, *) = IllegalArgumentException 084 * distance.apply(*, null) = IllegalArgumentException 085 * distance.apply("","") = 0 086 * distance.apply("","a") = 1 087 * distance.apply("aaapppp", "") = 7 088 * distance.apply("frog", "fog") = 1 089 * distance.apply("fly", "ant") = 3 090 * distance.apply("elephant", "hippo") = 7 091 * distance.apply("hippo", "elephant") = 7 092 * distance.apply("hippo", "zzzzzzzz") = 8 093 * distance.apply("hello", "hallo") = 1 094 * </pre> 095 * 096 * @param left the first string, must not be null 097 * @param right the second string, must not be null 098 * @return result distance, or -1 099 * @throws IllegalArgumentException if either String input {@code null} 100 */ 101 @Override 102 public LevenshteinResults apply(final CharSequence left, final CharSequence right) { 103 if (threshold != null) { 104 return limitedCompare(left, right, threshold); 105 } 106 return unlimitedCompare(left, right); 107 } 108 109 /** 110 * Gets the default instance. 111 * 112 * @return the default instace 113 */ 114 public static LevenshteinDetailedDistance getDefaultInstance() { 115 return DEFAULT_INSTANCE; 116 } 117 118 /** 119 * Gets the distance threshold. 120 * 121 * @return the distance threshold 122 */ 123 public Integer getThreshold() { 124 return threshold; 125 } 126 127 /** 128 * Find the Levenshtein distance between two CharSequences if it's less than or 129 * equal to a given threshold. 130 * 131 * <p> 132 * This implementation follows from Algorithms on Strings, Trees and 133 * Sequences by Dan Gusfield and Chas Emerick's implementation of the 134 * Levenshtein distance algorithm from <a 135 * href="http://www.merriampark.com/ld.htm" 136 * >http://www.merriampark.com/ld.htm</a> 137 * </p> 138 * 139 * <pre> 140 * limitedCompare(null, *, *) = IllegalArgumentException 141 * limitedCompare(*, null, *) = IllegalArgumentException 142 * limitedCompare(*, *, -1) = IllegalArgumentException 143 * limitedCompare("","", 0) = 0 144 * limitedCompare("aaapppp", "", 8) = 7 145 * limitedCompare("aaapppp", "", 7) = 7 146 * limitedCompare("aaapppp", "", 6)) = -1 147 * limitedCompare("elephant", "hippo", 7) = 7 148 * limitedCompare("elephant", "hippo", 6) = -1 149 * limitedCompare("hippo", "elephant", 7) = 7 150 * limitedCompare("hippo", "elephant", 6) = -1 151 * </pre> 152 * 153 * @param left the first CharSequence, must not be null 154 * @param right the second CharSequence, must not be null 155 * @param threshold the target threshold, must not be negative 156 * @return result distance, or -1 157 */ 158 private static LevenshteinResults limitedCompare(CharSequence left, 159 CharSequence right, 160 final int threshold) { //NOPMD 161 if (left == null || right == null) { 162 throw new IllegalArgumentException("CharSequences must not be null"); 163 } 164 if (threshold < 0) { 165 throw new IllegalArgumentException("Threshold must not be negative"); 166 } 167 168 /* 169 * This implementation only computes the distance if it's less than or 170 * equal to the threshold value, returning -1 if it's greater. The 171 * advantage is performance: unbounded distance is O(nm), but a bound of 172 * k allows us to reduce it to O(km) time by only computing a diagonal 173 * stripe of width 2k + 1 of the cost table. It is also possible to use 174 * this to compute the unbounded Levenshtein distance by starting the 175 * threshold at 1 and doubling each time until the distance is found; 176 * this is O(dm), where d is the distance. 177 * 178 * One subtlety comes from needing to ignore entries on the border of 179 * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry 180 * to the left of the leftmost member We must ignore the entry above the 181 * rightmost member 182 * 183 * Another subtlety comes from our stripe running off the matrix if the 184 * strings aren't of the same size. Since string s is always swapped to 185 * be the shorter of the two, the stripe will always run off to the 186 * upper right instead of the lower left of the matrix. 187 * 188 * As a concrete example, suppose s is of length 5, t is of length 7, 189 * and our threshold is 1. In this case we're going to walk a stripe of 190 * length 3. The matrix would look like so: 191 * 192 * <pre> 193 * 1 2 3 4 5 194 * 1 |#|#| | | | 195 * 2 |#|#|#| | | 196 * 3 | |#|#|#| | 197 * 4 | | |#|#|#| 198 * 5 | | | |#|#| 199 * 6 | | | | |#| 200 * 7 | | | | | | 201 * </pre> 202 * 203 * Note how the stripe leads off the table as there is no possible way 204 * to turn a string of length 5 into one of length 7 in edit distance of 205 * 1. 206 * 207 * Additionally, this implementation decreases memory usage by using two 208 * single-dimensional arrays and swapping them back and forth instead of 209 * allocating an entire n by m matrix. This requires a few minor 210 * changes, such as immediately returning when it's detected that the 211 * stripe has run off the matrix and initially filling the arrays with 212 * large values so that entries we don't compute are ignored. 213 * 214 * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for 215 * some discussion. 216 */ 217 218 int n = left.length(); // length of left 219 int m = right.length(); // length of right 220 221 // if one string is empty, the edit distance is necessarily the length of the other 222 if (n == 0) { 223 return m <= threshold ? new LevenshteinResults(m, m, 0, 0) : new LevenshteinResults(-1, 0, 0, 0); 224 } else if (m == 0) { 225 return n <= threshold ? new LevenshteinResults(n, 0, n, 0) : new LevenshteinResults(-1, 0, 0, 0); 226 } 227 228 boolean swapped = false; 229 if (n > m) { 230 // swap the two strings to consume less memory 231 final CharSequence tmp = left; 232 left = right; 233 right = tmp; 234 n = m; 235 m = right.length(); 236 swapped = true; 237 } 238 239 int[] p = new int[n + 1]; // 'previous' cost array, horizontally 240 int[] d = new int[n + 1]; // cost array, horizontally 241 int[] tempD; // placeholder to assist in swapping p and d 242 final int[][] matrix = new int[m + 1][n + 1]; 243 244 //filling the first row and first column values in the matrix 245 for (int index = 0; index <= n; index++) { 246 matrix[0][index] = index; 247 } 248 for (int index = 0; index <= m; index++) { 249 matrix[index][0] = index; 250 } 251 252 // fill in starting table values 253 final int boundary = Math.min(n, threshold) + 1; 254 for (int i = 0; i < boundary; i++) { 255 p[i] = i; 256 } 257 // these fills ensure that the value above the rightmost entry of our 258 // stripe will be ignored in following loop iterations 259 Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE); 260 Arrays.fill(d, Integer.MAX_VALUE); 261 262 // iterates through t 263 for (int j = 1; j <= m; j++) { 264 final char rightJ = right.charAt(j - 1); // jth character of right 265 d[0] = j; 266 267 // compute stripe indices, constrain to array size 268 final int min = Math.max(1, j - threshold); 269 final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min( 270 n, j + threshold); 271 272 // the stripe may lead off of the table if s and t are of different sizes 273 if (min > max) { 274 return new LevenshteinResults(-1, 0, 0, 0); 275 } 276 277 // ignore entry left of leftmost 278 if (min > 1) { 279 d[min - 1] = Integer.MAX_VALUE; 280 } 281 282 // iterates through [min, max] in s 283 for (int i = min; i <= max; i++) { 284 if (left.charAt(i - 1) == rightJ) { 285 // diagonally left and up 286 d[i] = p[i - 1]; 287 } else { 288 // 1 + minimum of cell to the left, to the top, diagonally left and up 289 d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]); 290 } 291 matrix[j][i] = d[i]; 292 } 293 294 // copy current distance counts to 'previous row' distance counts 295 tempD = p; 296 p = d; 297 d = tempD; 298 } 299 300 // if p[n] is greater than the threshold, there's no guarantee on it being the correct distance 301 if (p[n] <= threshold) { 302 return findDetailedResults(left, right, matrix, swapped); 303 } 304 return new LevenshteinResults(-1, 0, 0, 0); 305 } 306 307 /** 308 * <p>Find the Levenshtein distance between two Strings.</p> 309 * 310 * <p>A higher score indicates a greater distance.</p> 311 * 312 * <p>The previous implementation of the Levenshtein distance algorithm 313 * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> 314 * 315 * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError 316 * which can occur when my Java implementation is used with very large strings.<br> 317 * This implementation of the Levenshtein distance algorithm 318 * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> 319 * 320 * <pre> 321 * unlimitedCompare(null, *) = IllegalArgumentException 322 * unlimitedCompare(*, null) = IllegalArgumentException 323 * unlimitedCompare("","") = 0 324 * unlimitedCompare("","a") = 1 325 * unlimitedCompare("aaapppp", "") = 7 326 * unlimitedCompare("frog", "fog") = 1 327 * unlimitedCompare("fly", "ant") = 3 328 * unlimitedCompare("elephant", "hippo") = 7 329 * unlimitedCompare("hippo", "elephant") = 7 330 * unlimitedCompare("hippo", "zzzzzzzz") = 8 331 * unlimitedCompare("hello", "hallo") = 1 332 * </pre> 333 * 334 * @param left the first CharSequence, must not be null 335 * @param right the second CharSequence, must not be null 336 * @return result distance, or -1 337 * @throws IllegalArgumentException if either CharSequence input is {@code null} 338 */ 339 private static LevenshteinResults unlimitedCompare(CharSequence left, CharSequence right) { 340 if (left == null || right == null) { 341 throw new IllegalArgumentException("CharSequences must not be null"); 342 } 343 344 /* 345 The difference between this impl. and the previous is that, rather 346 than creating and retaining a matrix of size s.length() + 1 by t.length() + 1, 347 we maintain two single-dimensional arrays of length s.length() + 1. The first, d, 348 is the 'current working' distance array that maintains the newest distance cost 349 counts as we iterate through the characters of String s. Each time we increment 350 the index of String t we are comparing, d is copied to p, the second int[]. Doing so 351 allows us to retain the previous cost counts as required by the algorithm (taking 352 the minimum of the cost count to the left, up one, and diagonally up and to the left 353 of the current cost count being calculated). (Note that the arrays aren't really 354 copied anymore, just switched...this is clearly much better than cloning an array 355 or doing a System.arraycopy() each time through the outer loop.) 356 357 Effectively, the difference between the two implementations is this one does not 358 cause an out of memory condition when calculating the LD over two very large strings. 359 */ 360 361 int n = left.length(); // length of left 362 int m = right.length(); // length of right 363 364 if (n == 0) { 365 return new LevenshteinResults(m, m, 0, 0); 366 } else if (m == 0) { 367 return new LevenshteinResults(n, 0, n, 0); 368 } 369 boolean swapped = false; 370 if (n > m) { 371 // swap the input strings to consume less memory 372 final CharSequence tmp = left; 373 left = right; 374 right = tmp; 375 n = m; 376 m = right.length(); 377 swapped = true; 378 } 379 380 int[] p = new int[n + 1]; // 'previous' cost array, horizontally 381 int[] d = new int[n + 1]; // cost array, horizontally 382 int[] tempD; //placeholder to assist in swapping p and d 383 final int[][] matrix = new int[m + 1][n + 1]; 384 385 // filling the first row and first column values in the matrix 386 for (int index = 0; index <= n; index++) { 387 matrix[0][index] = index; 388 } 389 for (int index = 0; index <= m; index++) { 390 matrix[index][0] = index; 391 } 392 393 // indexes into strings left and right 394 int i; // iterates through left 395 int j; // iterates through right 396 397 char rightJ; // jth character of right 398 399 int cost; // cost 400 for (i = 0; i <= n; i++) { 401 p[i] = i; 402 } 403 404 for (j = 1; j <= m; j++) { 405 rightJ = right.charAt(j - 1); 406 d[0] = j; 407 408 for (i = 1; i <= n; i++) { 409 cost = left.charAt(i - 1) == rightJ ? 0 : 1; 410 // minimum of cell to the left+1, to the top+1, diagonally left and up +cost 411 d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost); 412 //filling the matrix 413 matrix[j][i] = d[i]; 414 } 415 416 // copy current distance counts to 'previous row' distance counts 417 tempD = p; 418 p = d; 419 d = tempD; 420 } 421 return findDetailedResults(left, right, matrix, swapped); 422 } 423 424 /** 425 * Finds count for each of the three [insert, delete, substitute] operations 426 * needed. This is based on the matrix formed based on the two character 427 * sequence. 428 * 429 * @param left character sequence which need to be converted from 430 * @param right character sequence which need to be converted to 431 * @param matrix two dimensional array containing 432 * @param swapped tells whether the value for left character sequence and right 433 * character sequence were swapped to save memory 434 * @return result object containing the count of insert, delete and substitute and total count needed 435 */ 436 private static LevenshteinResults findDetailedResults(final CharSequence left, 437 final CharSequence right, 438 final int[][] matrix, 439 final boolean swapped) { 440 441 int delCount = 0; 442 int addCount = 0; 443 int subCount = 0; 444 445 int rowIndex = right.length(); 446 int columnIndex = left.length(); 447 448 int dataAtLeft = 0; 449 int dataAtTop = 0; 450 int dataAtDiagonal = 0; 451 int data = 0; 452 boolean deleted = false; 453 boolean added = false; 454 455 while (rowIndex >= 0 && columnIndex >= 0) { 456 457 if (columnIndex == 0) { 458 dataAtLeft = -1; 459 } else { 460 dataAtLeft = matrix[rowIndex][columnIndex - 1]; 461 } 462 if (rowIndex == 0) { 463 dataAtTop = -1; 464 } else { 465 dataAtTop = matrix[rowIndex - 1][columnIndex]; 466 } 467 if (rowIndex > 0 && columnIndex > 0) { 468 dataAtDiagonal = matrix[rowIndex - 1][columnIndex - 1]; 469 } else { 470 dataAtDiagonal = -1; 471 } 472 if (dataAtLeft == -1 && dataAtTop == -1 && dataAtDiagonal == -1) { 473 break; 474 } 475 data = matrix[rowIndex][columnIndex]; 476 477 // case in which the character at left and right are the same, 478 // in this case none of the counters will be incremented. 479 if (columnIndex > 0 && rowIndex > 0 && left.charAt(columnIndex - 1) == right.charAt(rowIndex - 1)) { 480 columnIndex--; 481 rowIndex--; 482 continue; 483 } 484 485 // handling insert and delete cases. 486 deleted = false; 487 added = false; 488 if (data - 1 == dataAtLeft && (data <= dataAtDiagonal && data <= dataAtTop) 489 || (dataAtDiagonal == -1 && dataAtTop == -1)) { // NOPMD 490 columnIndex--; 491 if (swapped) { 492 addCount++; 493 added = true; 494 } else { 495 delCount++; 496 deleted = true; 497 } 498 } else if (data - 1 == dataAtTop && (data <= dataAtDiagonal && data <= dataAtLeft) 499 || (dataAtDiagonal == -1 && dataAtLeft == -1)) { // NOPMD 500 rowIndex--; 501 if (swapped) { 502 delCount++; 503 deleted = true; 504 } else { 505 addCount++; 506 added = true; 507 } 508 } 509 510 // substituted case 511 if (!added && !deleted) { 512 subCount++; 513 columnIndex--; 514 rowIndex--; 515 } 516 } 517 return new LevenshteinResults(addCount + delCount + subCount, addCount, delCount, subCount); 518 } 519}