1 | /* |
2 | +----------------------------------------------------------------------+ |
3 | | PHP Version 5 | |
4 | +----------------------------------------------------------------------+ |
5 | | Copyright (c) 1997-2015 The PHP Group | |
6 | +----------------------------------------------------------------------+ |
7 | | This source file is subject to version 3.01 of the PHP license, | |
8 | | that is bundled with this package in the file LICENSE, and is | |
9 | | available through the world-wide-web at the following url: | |
10 | | http://www.php.net/license/3_01.txt | |
11 | | If you did not receive a copy of the PHP license and are unable to | |
12 | | obtain it through the world-wide-web, please send a note to | |
13 | | license@php.net so we can mail you a copy immediately. | |
14 | +----------------------------------------------------------------------+ |
15 | | Author: Thies C. Arntzen <thies@thieso.net> | |
16 | +----------------------------------------------------------------------+ |
17 | */ |
18 | |
19 | /* $Id$ */ |
20 | |
21 | /* |
	Based on CPAN's "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
23 | */ |
24 | |
25 | #include "php.h" |
26 | #include "php_metaphone.h" |
27 | |
/* Core encoder, defined further down in this file. Returns 0 on success and
 * -1 when max_phonemes is negative or word is NULL. */
static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional);
29 | |
30 | /* {{{ proto string metaphone(string text[, int phones]) |
31 | Break english phrases down into their phonemes */ |
32 | PHP_FUNCTION(metaphone) |
33 | { |
34 | char *str; |
35 | char *result = 0; |
36 | int str_len; |
37 | long phones = 0; |
38 | |
39 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l" , &str, &str_len, |
40 | &phones) == FAILURE) { |
41 | return; |
42 | } |
43 | |
44 | if (metaphone((unsigned char *)str, str_len, phones, &result, 1) == 0) { |
45 | RETVAL_STRING(result, 0); |
46 | } else { |
47 | if (result) { |
48 | efree(result); |
49 | } |
50 | RETURN_FALSE; |
51 | } |
52 | } |
53 | /* }}} */ |
54 | |
/*
   This is now the original code by Michael G Schwern:
   I've changed it just slightly (use emalloc,
   get rid of includes etc)
	- thies - 13.09.1999
*/
61 | |
62 | /*----------------------------- */ |
63 | /* this used to be "metaphone.h" */ |
64 | /*----------------------------- */ |
65 | |
/* Special encodings: single output characters that stand in for sounds
 * which have no letter of their own in the phoned word. */
#define SH 'X'	/* the 'sh' sound */
#define TH '0'	/* the 'th' sound (digit zero) */
69 | |
70 | /*----------------------------- */ |
71 | /* end of "metaphone.h" */ |
72 | /*----------------------------- */ |
73 | |
74 | /*----------------------------- */ |
75 | /* this used to be "metachar.h" */ |
76 | /*----------------------------- */ |
77 | |
/* Metachar.h ... little bits about characters for metaphone */
/*-- Character encoding array & accessing macros --*/
/* Stolen directly out of the book... */

/* One bit-mask per letter A..Z; the meaning of each bit is given by the
 * accessor macros below. The table is read-only and private to this file. */
static const char _codes[26] = {
	 1, 16,  4, 16,  9,  2,  4, 16,  9,  2,  0,  2,  2,
/*	 a   b   c   d   e   f   g   h   i   j   k   l   m  */
	 2,  1,  4,  0,  2,  4,  4,  1,  0,  0,  0,  8,  0
/*	 n   o   p   q   r   s   t   u   v   w   x   y   z  */
};

/* Look up the flag bits for a character; anything that is not an ASCII
 * letter yields 0.
 * The range is tested explicitly instead of via isalpha()/toupper(): in a
 * non-"C" locale isalpha() may accept bytes above 0x7F, and indexing _codes
 * with such a value would read outside the 26-entry table.
 * Note: c is evaluated more than once, so pass side-effect-free expressions. */
#define ENCODE(c) (((c) >= 'A' && (c) <= 'Z') ? _codes[(c) - 'A'] : \
                   (((c) >= 'a' && (c) <= 'z') ? _codes[(c) - 'a'] : 0))

#define isvowel(c)  (ENCODE(c) & 1)		/* AEIOU */

/* These letters are passed through unchanged */
#define NOCHANGE(c) (ENCODE(c) & 2)		/* FJLMNR */

/* These form diphthongs when preceding H */
#define AFFECTH(c)  (ENCODE(c) & 4)		/* CGPST */

/* These make C and G soft */
#define MAKESOFT(c) (ENCODE(c) & 8)		/* EIY */

/* These prevent GH from becoming F */
#define NOGHTOF(c)  (ENCODE(c) & 16)	/* BDH */
103 | |
104 | /*----------------------------- */ |
105 | /* end of "metachar.h" */ |
106 | /*----------------------------- */ |
107 | |
/* I suppose I could have been using a character pointer instead of
 * accessing the array directly... */

/* All of these expect `word` (the input string) and `w_idx` (the current
 * position in it) to be in scope where the macro is used, and all of them
 * yield the upper-cased letter. */

/* Look at the next letter in the word */
#define Next_Letter (toupper(word[w_idx+1]))
/* Look at the current letter in the word */
#define Curr_Letter (toupper(word[w_idx]))
/* Go N letters back; yields '\0' when that would fall before the start of
 * the word. The argument is parenthesised so that expressions such as
 * Look_Back_Letter(k - 1) index the intended element. */
#define Look_Back_Letter(n) (w_idx >= (n) ? toupper(word[w_idx-(n)]) : '\0')
/* Previous letter, or '\0' at the start of the word. */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down.  It makes sure you don't walk off the string. */
#define After_Next_Letter (Next_Letter != '\0' ? toupper(word[w_idx+2]) \
											   : '\0')
/* Look N letters down without walking off the string. The pointer cast
 * matches Lookahead()'s char * parameter; the result is converted to
 * unsigned char before toupper(), whose argument must not be a negative
 * char value. */
#define Look_Ahead_Letter(n) (toupper((unsigned char) Lookahead((char *) word + w_idx, (n))))
123 | |
124 | |
/* Safely peek ahead in a string: returns the character how_far positions
 * past word, or the terminating '\0' if the string ends before that. */
static char Lookahead(char *word, int how_far)
{
	char *cursor = word;
	int remaining = how_far;

	/* Step forward until we have gone far enough or hit the end. */
	while (*cursor != '\0' && remaining > 0) {
		cursor++;
		remaining--;
	}

	return *cursor;
}
139 | |
140 | |
/* Phonize one letter: append c to the phoned word.
 * We don't know the buffer's size in advance. One way to solve this is to just
 * re-allocate the buffer when the write index reaches max_buffer_len. We're
 * using an extra of 2 characters (this could be one though; or more too).
 * Expects p_idx, max_buffer_len and phoned_word to be in scope at the point
 * of use. Expands to a bare { } block, so it is usable with or without a
 * trailing semicolon. */
#define Phonize(c)  { \
						if (p_idx >= max_buffer_len) { \
							*phoned_word = safe_erealloc(*phoned_word, 2, sizeof(char), max_buffer_len); \
							max_buffer_len += 2; \
						} \
						(*phoned_word)[p_idx++] = c; \
					}
/* Slap a null character on the end of the phoned word. When the write index
 * sits exactly at max_buffer_len the buffer is first re-allocated with room
 * for one more character. Does not advance p_idx. */
#define End_Phoned_Word	{ \
							if (p_idx == max_buffer_len) { \
								*phoned_word = safe_erealloc(*phoned_word, 1, sizeof(char), max_buffer_len); \
							} \
							(*phoned_word)[p_idx] = '\0'; \
						}
/* How long is the phoned word? */
#define Phone_Len (p_idx)

/* Is this character a 'break' in the word, i.e. not a letter? */
#define Isbreak(c)  (!isalpha(c))
164 | |
165 | /* {{{ metaphone |
166 | */ |
167 | static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional) |
168 | { |
169 | int w_idx = 0; /* point in the phonization we're at. */ |
170 | int p_idx = 0; /* end of the phoned phrase */ |
171 | int max_buffer_len = 0; /* maximum length of the destination buffer */ |
172 | |
173 | /*-- Parameter checks --*/ |
174 | /* Negative phoneme length is meaningless */ |
175 | |
176 | if (max_phonemes < 0) |
177 | return -1; |
178 | |
179 | /* Empty/null string is meaningless */ |
180 | /* Overly paranoid */ |
181 | /* assert(word != NULL && word[0] != '\0'); */ |
182 | |
183 | if (word == NULL) |
184 | return -1; |
185 | |
186 | /*-- Allocate memory for our phoned_phrase --*/ |
187 | if (max_phonemes == 0) { /* Assume largest possible */ |
188 | max_buffer_len = word_len; |
189 | *phoned_word = safe_emalloc(sizeof(char), word_len, 1); |
190 | } else { |
191 | max_buffer_len = max_phonemes; |
192 | *phoned_word = safe_emalloc(sizeof(char), max_phonemes, 1); |
193 | } |
194 | |
195 | |
196 | /*-- The first phoneme has to be processed specially. --*/ |
197 | /* Find our first letter */ |
198 | for (; !isalpha(Curr_Letter); w_idx++) { |
199 | /* On the off chance we were given nothing but crap... */ |
200 | if (Curr_Letter == '\0') { |
201 | End_Phoned_Word |
202 | return SUCCESS; /* For testing */ |
203 | } |
204 | } |
205 | |
206 | switch (Curr_Letter) { |
207 | /* AE becomes E */ |
208 | case 'A': |
209 | if (Next_Letter == 'E') { |
210 | Phonize('E'); |
211 | w_idx += 2; |
212 | } |
213 | /* Remember, preserve vowels at the beginning */ |
214 | else { |
215 | Phonize('A'); |
216 | w_idx++; |
217 | } |
218 | break; |
219 | /* [GKP]N becomes N */ |
220 | case 'G': |
221 | case 'K': |
222 | case 'P': |
223 | if (Next_Letter == 'N') { |
224 | Phonize('N'); |
225 | w_idx += 2; |
226 | } |
227 | break; |
228 | /* WH becomes W, |
229 | WR becomes R |
230 | W if followed by a vowel */ |
231 | case 'W': |
232 | if (Next_Letter == 'R') { |
233 | Phonize(Next_Letter); |
234 | w_idx += 2; |
235 | } else if (Next_Letter == 'H' || isvowel(Next_Letter)) { |
236 | Phonize('W'); |
237 | w_idx += 2; |
238 | } |
239 | /* else ignore */ |
240 | break; |
241 | /* X becomes S */ |
242 | case 'X': |
243 | Phonize('S'); |
244 | w_idx++; |
245 | break; |
246 | /* Vowels are kept */ |
247 | /* We did A already |
248 | case 'A': |
249 | case 'a': |
250 | */ |
251 | case 'E': |
252 | case 'I': |
253 | case 'O': |
254 | case 'U': |
255 | Phonize(Curr_Letter); |
256 | w_idx++; |
257 | break; |
258 | default: |
259 | /* do nothing */ |
260 | break; |
261 | } |
262 | |
263 | |
264 | |
265 | /* On to the metaphoning */ |
266 | for (; Curr_Letter != '\0' && |
267 | (max_phonemes == 0 || Phone_Len < max_phonemes); |
268 | w_idx++) { |
269 | /* How many letters to skip because an eariler encoding handled |
270 | * multiple letters */ |
271 | unsigned short int skip_letter = 0; |
272 | |
273 | |
274 | /* THOUGHT: It would be nice if, rather than having things like... |
275 | * well, SCI. For SCI you encode the S, then have to remember |
276 | * to skip the C. So the phonome SCI invades both S and C. It would |
277 | * be better, IMHO, to skip the C from the S part of the encoding. |
278 | * Hell, I'm trying it. |
279 | */ |
280 | |
281 | /* Ignore non-alphas */ |
282 | if (!isalpha(Curr_Letter)) |
283 | continue; |
284 | |
285 | /* Drop duplicates, except CC */ |
286 | if (Curr_Letter == Prev_Letter && |
287 | Curr_Letter != 'C') |
288 | continue; |
289 | |
290 | switch (Curr_Letter) { |
291 | /* B -> B unless in MB */ |
292 | case 'B': |
293 | if (Prev_Letter != 'M') |
294 | Phonize('B'); |
295 | break; |
296 | /* 'sh' if -CIA- or -CH, but not SCH, except SCHW. |
297 | * (SCHW is handled in S) |
298 | * S if -CI-, -CE- or -CY- |
299 | * dropped if -SCI-, SCE-, -SCY- (handed in S) |
300 | * else K |
301 | */ |
302 | case 'C': |
303 | if (MAKESOFT(Next_Letter)) { /* C[IEY] */ |
304 | if (After_Next_Letter == 'A' && |
305 | Next_Letter == 'I') { /* CIA */ |
306 | Phonize(SH); |
307 | } |
308 | /* SC[IEY] */ |
309 | else if (Prev_Letter == 'S') { |
310 | /* Dropped */ |
311 | } else { |
312 | Phonize('S'); |
313 | } |
314 | } else if (Next_Letter == 'H') { |
315 | if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) { /* Christ, School */ |
316 | Phonize('K'); |
317 | } else { |
318 | Phonize(SH); |
319 | } |
320 | skip_letter++; |
321 | } else { |
322 | Phonize('K'); |
323 | } |
324 | break; |
325 | /* J if in -DGE-, -DGI- or -DGY- |
326 | * else T |
327 | */ |
328 | case 'D': |
329 | if (Next_Letter == 'G' && |
330 | MAKESOFT(After_Next_Letter)) { |
331 | Phonize('J'); |
332 | skip_letter++; |
333 | } else |
334 | Phonize('T'); |
335 | break; |
336 | /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH |
337 | * else dropped if -GNED, -GN, |
338 | * else dropped if -DGE-, -DGI- or -DGY- (handled in D) |
339 | * else J if in -GE-, -GI, -GY and not GG |
340 | * else K |
341 | */ |
342 | case 'G': |
343 | if (Next_Letter == 'H') { |
344 | if (!(NOGHTOF(Look_Back_Letter(3)) || |
345 | Look_Back_Letter(4) == 'H')) { |
346 | Phonize('F'); |
347 | skip_letter++; |
348 | } else { |
349 | /* silent */ |
350 | } |
351 | } else if (Next_Letter == 'N') { |
352 | if (Isbreak(After_Next_Letter) || |
353 | (After_Next_Letter == 'E' && |
354 | Look_Ahead_Letter(3) == 'D')) { |
355 | /* dropped */ |
356 | } else |
357 | Phonize('K'); |
358 | } else if (MAKESOFT(Next_Letter) && |
359 | Prev_Letter != 'G') { |
360 | Phonize('J'); |
361 | } else { |
362 | Phonize('K'); |
363 | } |
364 | break; |
365 | /* H if before a vowel and not after C,G,P,S,T */ |
366 | case 'H': |
367 | if (isvowel(Next_Letter) && |
368 | !AFFECTH(Prev_Letter)) |
369 | Phonize('H'); |
370 | break; |
371 | /* dropped if after C |
372 | * else K |
373 | */ |
374 | case 'K': |
375 | if (Prev_Letter != 'C') |
376 | Phonize('K'); |
377 | break; |
378 | /* F if before H |
379 | * else P |
380 | */ |
381 | case 'P': |
382 | if (Next_Letter == 'H') { |
383 | Phonize('F'); |
384 | } else { |
385 | Phonize('P'); |
386 | } |
387 | break; |
388 | /* K |
389 | */ |
390 | case 'Q': |
391 | Phonize('K'); |
392 | break; |
393 | /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW- |
394 | * else S |
395 | */ |
396 | case 'S': |
397 | if (Next_Letter == 'I' && |
398 | (After_Next_Letter == 'O' || |
399 | After_Next_Letter == 'A')) { |
400 | Phonize(SH); |
401 | } else if (Next_Letter == 'H') { |
402 | Phonize(SH); |
403 | skip_letter++; |
404 | } else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) { |
405 | Phonize(SH); |
406 | skip_letter += 2; |
407 | } else { |
408 | Phonize('S'); |
409 | } |
410 | break; |
411 | /* 'sh' in -TIA- or -TIO- |
412 | * else 'th' before H |
413 | * else T |
414 | */ |
415 | case 'T': |
416 | if (Next_Letter == 'I' && |
417 | (After_Next_Letter == 'O' || |
418 | After_Next_Letter == 'A')) { |
419 | Phonize(SH); |
420 | } else if (Next_Letter == 'H') { |
421 | Phonize(TH); |
422 | skip_letter++; |
423 | } else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) { |
424 | Phonize('T'); |
425 | } |
426 | break; |
427 | /* F */ |
428 | case 'V': |
429 | Phonize('F'); |
430 | break; |
431 | /* W before a vowel, else dropped */ |
432 | case 'W': |
433 | if (isvowel(Next_Letter)) |
434 | Phonize('W'); |
435 | break; |
436 | /* KS */ |
437 | case 'X': |
438 | Phonize('K'); |
439 | Phonize('S'); |
440 | break; |
441 | /* Y if followed by a vowel */ |
442 | case 'Y': |
443 | if (isvowel(Next_Letter)) |
444 | Phonize('Y'); |
445 | break; |
446 | /* S */ |
447 | case 'Z': |
448 | Phonize('S'); |
449 | break; |
450 | /* No transformation */ |
451 | case 'F': |
452 | case 'J': |
453 | case 'L': |
454 | case 'M': |
455 | case 'N': |
456 | case 'R': |
457 | Phonize(Curr_Letter); |
458 | break; |
459 | default: |
460 | /* nothing */ |
461 | break; |
462 | } /* END SWITCH */ |
463 | |
464 | w_idx += skip_letter; |
465 | } /* END FOR */ |
466 | |
467 | End_Phoned_Word; |
468 | |
469 | return 0; |
470 | } /* END metaphone */ |
471 | /* }}} */ |
472 | |
473 | /* |
474 | * Local variables: |
475 | * tab-width: 4 |
476 | * c-basic-offset: 4 |
477 | * End: |
478 | * vim600: sw=4 ts=4 fdm=marker |
479 | * vim<600: sw=4 ts=4 |
480 | */ |
481 | |