1 /**
2  * Compiler implementation of the D programming language
3  * http://dlang.org
4  *
5  * Copyright: Copyright (c) 1999-2016 by Digital Mars, All Rights Reserved
6  * Authors:   Walter Bright, http://www.digitalmars.com
7  * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
8  * Source:    $(DMDSRC root/_speller.d)
9  */
10 
11 module ddmd.root.speller;
12 
13 import core.stdc.limits;
14 import core.stdc.stdlib;
15 import core.stdc..string;
16 
17 alias dg_speller_t = void* delegate(const(char)*, ref int);
18 
19 __gshared const(char)* idchars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_";
20 
21 /**************************************************
22  * combine a new result from the spell checker to
23  * find the one with the closest symbol with
24  * respect to the cost defined by the search function
25  * Input/Output:
26  *      p       best found spelling (NULL if none found yet)
27  *      cost    cost of p (INT_MAX if none found yet)
28  * Input:
29  *      np      new found spelling (NULL if none found)
30  *      ncost   cost of np if non-NULL
31  * Returns:
32  *      true    if the cost is less or equal 0
33  *      false   otherwise
34  */
35 bool combineSpellerResult(ref void* p, ref int cost, void* np, int ncost)
36 {
37     if (np && ncost < cost)
38     {
39         p = np;
40         cost = ncost;
41         if (cost <= 0)
42             return true;
43     }
44     return false;
45 }
46 
47 void* spellerY(const(char)* seed, size_t seedlen, dg_speller_t dg, const(char)* charset, size_t index, int* cost)
48 {
49     if (!seedlen)
50         return null;
51     assert(seed[seedlen] == 0);
52     char[30] tmp;
53     char* buf;
54     if (seedlen <= tmp.sizeof - 2)
55         buf = tmp.ptr;
56     else
57     {
58         buf = cast(char*)alloca(seedlen + 2); // leave space for extra char
59         if (!buf)
60             return null; // no matches
61     }
62     memcpy(buf, seed, index);
63     *cost = INT_MAX;
64     void* p = null;
65     int ncost;
66     /* Delete at seed[index] */
67     if (index < seedlen)
68     {
69         memcpy(buf + index, seed + index + 1, seedlen - index);
70         assert(buf[seedlen - 1] == 0);
71         void* np = dg(buf, ncost);
72         if (combineSpellerResult(p, *cost, np, ncost))
73             return p;
74     }
75     if (charset && *charset)
76     {
77         /* Substitutions */
78         if (index < seedlen)
79         {
80             memcpy(buf, seed, seedlen + 1);
81             for (const(char)* s = charset; *s; s++)
82             {
83                 buf[index] = *s;
84                 //printf("sub buf = '%s'\n", buf);
85                 void* np = dg(buf, ncost);
86                 if (combineSpellerResult(p, *cost, np, ncost))
87                     return p;
88             }
89             assert(buf[seedlen] == 0);
90         }
91         /* Insertions */
92         memcpy(buf + index + 1, seed + index, seedlen + 1 - index);
93         for (const(char)* s = charset; *s; s++)
94         {
95             buf[index] = *s;
96             //printf("ins buf = '%s'\n", buf);
97             void* np = dg(buf, ncost);
98             if (combineSpellerResult(p, *cost, np, ncost))
99                 return p;
100         }
101         assert(buf[seedlen + 1] == 0);
102     }
103     return p; // return "best" result
104 }
105 
106 void* spellerX(const(char)* seed, size_t seedlen, dg_speller_t dg, const(char)* charset, int flag)
107 {
108     if (!seedlen)
109         return null;
110     char[30] tmp;
111     char* buf;
112     if (seedlen <= tmp.sizeof - 2)
113         buf = tmp.ptr;
114     else
115     {
116         buf = cast(char*)alloca(seedlen + 2); // leave space for extra char
117         if (!buf)
118             return null; // no matches
119     }
120     int cost = INT_MAX, ncost;
121     void* p = null, np;
122     /* Deletions */
123     memcpy(buf, seed + 1, seedlen);
124     for (size_t i = 0; i < seedlen; i++)
125     {
126         //printf("del buf = '%s'\n", buf);
127         if (flag)
128             np = spellerY(buf, seedlen - 1, dg, charset, i, &ncost);
129         else
130             np = dg(buf, ncost);
131         if (combineSpellerResult(p, cost, np, ncost))
132             return p;
133         buf[i] = seed[i];
134     }
135     /* Transpositions */
136     if (!flag)
137     {
138         memcpy(buf, seed, seedlen + 1);
139         for (size_t i = 0; i + 1 < seedlen; i++)
140         {
141             // swap [i] and [i + 1]
142             buf[i] = seed[i + 1];
143             buf[i + 1] = seed[i];
144             //printf("tra buf = '%s'\n", buf);
145             if (combineSpellerResult(p, cost, dg(buf, ncost), ncost))
146                 return p;
147             buf[i] = seed[i];
148         }
149     }
150     if (charset && *charset)
151     {
152         /* Substitutions */
153         memcpy(buf, seed, seedlen + 1);
154         for (size_t i = 0; i < seedlen; i++)
155         {
156             for (const(char)* s = charset; *s; s++)
157             {
158                 buf[i] = *s;
159                 //printf("sub buf = '%s'\n", buf);
160                 if (flag)
161                     np = spellerY(buf, seedlen, dg, charset, i + 1, &ncost);
162                 else
163                     np = dg(buf, ncost);
164                 if (combineSpellerResult(p, cost, np, ncost))
165                     return p;
166             }
167             buf[i] = seed[i];
168         }
169         /* Insertions */
170         memcpy(buf + 1, seed, seedlen + 1);
171         for (size_t i = 0; i <= seedlen; i++) // yes, do seedlen+1 iterations
172         {
173             for (const(char)* s = charset; *s; s++)
174             {
175                 buf[i] = *s;
176                 //printf("ins buf = '%s'\n", buf);
177                 if (flag)
178                     np = spellerY(buf, seedlen + 1, dg, charset, i + 1, &ncost);
179                 else
180                     np = dg(buf, ncost);
181                 if (combineSpellerResult(p, cost, np, ncost))
182                     return p;
183             }
184             buf[i] = seed[i]; // going past end of seed[] is ok, as we hit the 0
185         }
186     }
187     return p; // return "best" result
188 }
189 
190 /**************************************************
191  * Looks for correct spelling.
192  * Currently only looks a 'distance' of one from the seed[].
193  * This does an exhaustive search, so can potentially be very slow.
194  * Input:
195  *      seed            wrongly spelled word
196  *      dg              search delegate
197  *      charset         character set
198  * Returns:
199  *      NULL            no correct spellings found
200  *      void*           value returned by dg() for first possible correct spelling
201  */
202 void* speller(const(char)* seed, scope dg_speller_t dg, const(char)* charset)
203 {
204     size_t seedlen = strlen(seed);
205     size_t maxdist = seedlen < 4 ? seedlen / 2 : 2;
206     for (int distance = 0; distance < maxdist; distance++)
207     {
208         void* p = spellerX(seed, seedlen, dg, charset, distance);
209         if (p)
210             return p;
211         //      if (seedlen > 10)
212         //          break;
213     }
214     return null; // didn't find it
215 }
216 
217 unittest
218 {
219     static __gshared const(char)*** cases =
220     [
221         ["hello", "hell", "y"],
222         ["hello", "hel", "y"],
223         ["hello", "ello", "y"],
224         ["hello", "llo", "y"],
225         ["hello", "hellox", "y"],
226         ["hello", "helloxy", "y"],
227         ["hello", "xhello", "y"],
228         ["hello", "xyhello", "y"],
229         ["hello", "ehllo", "y"],
230         ["hello", "helol", "y"],
231         ["hello", "abcd", "n"],
232         ["hello", "helxxlo", "y"],
233         ["hello", "ehlxxlo", "n"],
234         ["hello", "heaao", "y"],
235         ["_123456789_123456789_123456789_123456789", "_123456789_123456789_123456789_12345678", "y"],
236         [null, null, null]
237     ];
238     //printf("unittest_speller()\n");
239 
240     void* dgarg;
241 
242     void* speller_test(const(char)* s, ref int cost)
243     {
244         //printf("speller_test(%s, %s)\n", dgarg, s);
245         cost = 0;
246         if (strcmp(cast(char*)dgarg, s) == 0)
247             return dgarg;
248         return null;
249     }
250 
251     dgarg = cast(char*)"hell";
252     const(void)* p = speller(cast(const(char)*)"hello", &speller_test, idchars);
253     assert(p !is null);
254     for (int i = 0; cases[i][0]; i++)
255     {
256         //printf("case [%d]\n", i);
257         dgarg = cast(void*)cases[i][1];
258         void* p2 = speller(cases[i][0], &speller_test, idchars);
259         if (p2)
260             assert(cases[i][2][0] == 'y');
261         else
262             assert(cases[i][2][0] == 'n');
263     }
264     //printf("unittest_speller() success\n");
265 }