aboutsummaryrefslogtreecommitdiffstats
path: root/railties/guides/rails_guides/levenshtein.rb
blob: 02e35f60d2bd37c5a4bb412106304170c4024d95 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#
# Levenshtein distance algorithm implementation for Ruby, with UTF-8 support
#
# Author::  Paul BATTLEY (pbattley @ gmail.com)
# Version:: 1.3
# Date::    2005-04-19
#
# == About
#
# The Levenshtein distance is a measure of how similar two strings s and t are,
# calculated as the number of deletions/insertions/substitutions needed to
# transform s into t.  The greater the distance, the more the strings differ.
#
# The Levenshtein distance is also sometimes referred to as the
# easier-to-pronounce-and-spell 'edit distance'.
#
# == Revision history
#
# * 2005-05-19 1.3 Repairing an oversight, distance can now be called via
#   Levenshtein.distance(s, t)
# * 2005-05-04 1.2 Now uses just one 1-dimensional array.  I think this is as
#   far as optimisation can go.
# * 2005-05-04 1.1 Now storing only the current and previous rows of the matrix
#   instead of the whole lot.
#
# == Licence
#
# Copyright (c) 2005 Paul Battley
#
# Usage of the works is permitted provided that this instrument is retained
# with the works, so that any entity that uses the works is notified of this
# instrument.
#
# DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.  
#

module Levenshtein

    #
    # Calculate the Levenshtein distance between two strings +str1+ and +str2+.
    # +str1+ and +str2+ should be ASCII or UTF-8.
    #
    def distance(str1, str2)
        s = str1.unpack('U*')
        t = str2.unpack('U*')
        n = s.length
        m = t.length
        return m if (0 == n)
        return n if (0 == m)
        
        d = (0..m).to_a
        x = nil

        (0...n).each do |i|
            e = i+1
            (0...m).each do |j|
                cost = (s[i] == t[j]) ? 0 : 1
                x = [
                    d[j+1] + 1, # insertion
                    e + 1,      # deletion
                    d[j] + cost # substitution
                ].min
                d[j] = e
                e = x
            end
            d[m] = x
        end

        return x
    end

    extend self
end

if (__FILE__ == $0)
    require 'test/unit'

    class LevenshteinTest < Test::Unit::TestCase
        include Levenshtein

        EXPECTED = [
            # Easy ones
            ['test', 'test', 0],
            ['test', 'tent', 1],
            ['gumbo', 'gambol', 2],
            ['kitten', 'sitting', 3],
            # Empty strings
            ['foo', '', 3],
            ['', '', 0],
            ['a', '', 1],
            # UTF-8
            ["f\303\266o", 'foo', 1],
            ["fran\303\247ais", 'francais', 1],
            ["fran\303\247ais", "fran\303\246ais", 1],
            ["\347\247\201\343\201\256\345\220\215\345\211\215\343\201\257"<<
             "\343\203\235\343\203\274\343\203\253\343\201\247\343\201\231",
             "\343\201\274\343\201\217\343\201\256\345\220\215\345\211\215\343\201"<<
             "\257\343\203\235\343\203\274\343\203\253\343\201\247\343\201\231", 
             2], # Japanese
            # Edge cases
            ['a', 'a', 0],
            ['0123456789', 'abcdefghijklmnopqrstuvwxyz', 26]
        ]

        def test_known_distances
            EXPECTED.each do |a,b,x|
                assert_equal(x, distance(a, b))
                assert_equal(x, distance(b, a))
            end
        end
    end
end