metaphone.c source code [php/ext/standard/metaphone.c]

/*
   +----------------------------------------------------------------------+
   | PHP Version 5                                                        |
   +----------------------------------------------------------------------+
   | Copyright (c) 1997-2015 The PHP Group                                |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
   | Author: Thies C. Arntzen <thies@thieso.net>                          |
   +----------------------------------------------------------------------+
*/
18
/* $Id$ */
20
/*
	Based on CPAN's "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
*/
24
25	#include "php.h"
26	#include "php_metaphone.h"
27
/* Forward declaration of the encoder defined further down in this file.
 * The key is returned through *phoned_word; the return value is 0 on
 * success and -1 when the arguments are rejected. */
static int metaphone(unsigned char *word, int word_len, long max_phonemes, char **phoned_word, int traditional);
29
30	/ {{{ proto string metaphone(string text[, int phones])*
31	Break english phrases down into their phonemes /*
32	PHP_FUNCTION(metaphone)
33	{
34	char *str;
35	char *result = `0`;
36	int str_len;
37	long phones = `0`;
38
39	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s\|l", &str, &str_len,
40	&phones) == FAILURE) {
41	return;
42	}
43
44	if (metaphone((unsigned char *)str, str_len, phones, &result, `1`) == `0`) {
45	RETVAL_STRING(result, `0`);
46	} else {
47	if (result) {
48	efree(result);
49	}
50	RETURN_FALSE;
51	}
52	}
53	/ }}} /
54
/*
   this is now the original code by Michael G Schwern:
   i've changed it just a little bit (use emalloc,
   get rid of includes etc)
	- thies - 13.09.1999
*/
61
/*-----------------------------  */
/* this used to be "metaphone.h" */
/*-----------------------------  */

/* Special encodings: single output characters that stand for the
 * two-letter sounds 'sh' and 'th'.  Note that TH is the digit zero. */
#define SH	'X'
#define TH	'0'

/*-----------------------------  */
/* end of "metaphone.h"          */
/*-----------------------------  */
73
/*-----------------------------  */
/* this used to be "metachar.h"  */
/*-----------------------------  */

/* Metachar.h ... little bits about characters for metaphone */
/*-- Character encoding array & accessing macros --*/
/* Stolen directly out of the book... */

/* One bitmask per letter, indexed by (uppercase letter - 'A').
 * The bits are the ones tested by the macros below:
 *    1 vowel, 2 passed through unchanged, 4 forms a diphthong before H,
 *    8 makes C and G soft, 16 prevents GH from becoming F. */
char _codes[26] =
{
	1, 16, 4, 16, 9, 2, 4, 16, 9, 2,	/* a b c d e f g h i j */
	0, 2, 2, 2, 1, 4, 0, 2, 4, 4,		/* k l m n o p q r s t */
	1, 0, 0, 0, 8, 0					/* u v w x y z */
};

/* Bitmask for character c, or 0 when c is not a letter.
 * isalpha() depends on the locale, so the explicit A..Z range check is what
 * keeps the index inside the 26-entry table.  c is evaluated more than once. */
#define ENCODE(c) ((isalpha(c) && toupper(c) >= 'A' && toupper(c) <= 'Z') \
					? _codes[toupper(c) - 'A'] : 0)

#define isvowel(c)	(ENCODE(c) & 1)		/* AEIOU */

/* These letters are passed through unchanged */
#define NOCHANGE(c)	(ENCODE(c) & 2)		/* FJLMNR */

/* These form diphthongs when preceding H */
#define AFFECTH(c)	(ENCODE(c) & 4)		/* CGPST */

/* These make C and G soft */
#define MAKESOFT(c)	(ENCODE(c) & 8)		/* EIY */

/* These prevent GH from becoming F */
#define NOGHTOF(c)	(ENCODE(c) & 16)	/* BDH */

/*-----------------------------  */
/* end of "metachar.h"           */
/*-----------------------------  */
107
/* I suppose I could have been using a character pointer instead of
 * accessing the array directly... */

/* All of these expect `word` (the input string) and `w_idx` (the current
 * position in it) to be in scope at the point of use. */

/* Look at the next letter in the word */
#define Next_Letter (toupper(word[w_idx+1]))
/* Look at the current letter in the word */
#define Curr_Letter (toupper(word[w_idx]))
/* Go N letters back; yields '\0' when that would be before the start. */
#define Look_Back_Letter(n) (w_idx >= (n) ? toupper(word[w_idx-(n)]) : '\0')
/* Previous letter.  I dunno, should this return null on failure? */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down.  It makes sure you don't walk off the string. */
#define After_Next_Letter (Next_Letter != '\0' ? toupper(word[w_idx+2]) \
											   : '\0')
/* Look n letters down via Lookahead().  Its plain-char result is converted
 * to unsigned char first, because toupper() is undefined for negative
 * values. */
#define Look_Ahead_Letter(n) \
	(toupper((unsigned char)Lookahead((char *)(word + w_idx), (n))))
123
124
/* Allows us to safely look ahead an arbitrary # of letters.
 *
 * Returns the character how_far positions into word, or '\0' if the string
 * ends before that position.  A how_far of zero or less returns word[0].
 */
/* I probably could have just used strlen... */
static char Lookahead(const char *word, int how_far)
{
	int idx = 0;

	/* Edge forward in the string, stopping early at the terminator. */
	while (idx < how_far && word[idx] != '\0') {
		idx++;
	}

	/* idx is now either == how_far or at the end of the string. */
	return word[idx];
}
139
140
/* phonize one letter
 * We don't know the buffer's size in advance. One way to solve this is to just
 * re-allocate the buffer size. We're using an extra of 2 characters (this
 * could be one though; or more too).
 *
 * Expects `phoned_word` (char **), `p_idx` and `max_buffer_len` in scope.
 * The result of the reallocation is stored back through *phoned_word so the
 * caller's pointer follows the buffer if it moves.
 * Kept as a plain brace block because End_Phoned_Word is also used without a
 * trailing semicolon. */
#define Phonize(c)	{ \
						if (p_idx >= max_buffer_len) { \
							*phoned_word = safe_erealloc(*phoned_word, 2, sizeof(char), max_buffer_len); \
							max_buffer_len += 2; \
						} \
						(*phoned_word)[p_idx++] = (c); \
					}
/* Slap a null character on the end of the phoned word */
#define End_Phoned_Word	{ \
							if (p_idx == max_buffer_len) { \
								*phoned_word = safe_erealloc(*phoned_word, 1, sizeof(char), max_buffer_len); \
							} \
							(*phoned_word)[p_idx] = '\0'; \
						}
/* How long is the phoned word? */
#define Phone_Len (p_idx)

/* Note if a letter is a 'break' in the word */
#define Isbreak(c)	(!isalpha(c))
164
165	/ {{{ metaphone*
166	*/
167	static int metaphone(unsigned char word, int* word_len, long max_phonemes, char *phoned_word, int* traditional)
168	{
169	int w_idx = `0`; / point in the phonization we're at. /
170	int p_idx = `0`; / end of the phoned phrase /
171	int max_buffer_len = `0`; / maximum length of the destination buffer /
172
173	/-- Parameter checks --/
174	/ Negative phoneme length is meaningless /
175
176	if (max_phonemes < `0`)
177	return -`1`;
178
179	/ Empty/null string is meaningless /
180	/ Overly paranoid /
181	/ assert(word != NULL && word[0] != '\0'); /
182
183	if (word == NULL)
184	return -`1`;
185
186	/-- Allocate memory for our phoned_phrase --/
187	if (max_phonemes == `0`) { / Assume largest possible /
188	max_buffer_len = word_len;
189	phoned_word = safe_emalloc(sizeof(char*), word_len, `1`);
190	} else {
191	max_buffer_len = max_phonemes;
192	phoned_word = safe_emalloc(sizeof(char*), max_phonemes, `1`);
193	}
194
195
196	/-- The first phoneme has to be processed specially. --/
197	/ Find our first letter /
198	for (; !isalpha(Curr_Letter); w_idx++) {
199	/ On the off chance we were given nothing but crap... /
200	if (Curr_Letter == `'\0'`) {
201	End_Phoned_Word
202	return SUCCESS; / For testing /
203	}
204	}
205
206	switch (Curr_Letter) {
207	/ AE becomes E /
208	case `'A'`:
209	if (Next_Letter == `'E'`) {
210	Phonize(`'E'`);
211	w_idx += `2`;
212	}
213	/ Remember, preserve vowels at the beginning /
214	else {
215	Phonize(`'A'`);
216	w_idx++;
217	}
218	break;
219	/ [GKP]N becomes N /
220	case `'G'`:
221	case `'K'`:
222	case `'P'`:
223	if (Next_Letter == `'N'`) {
224	Phonize(`'N'`);
225	w_idx += `2`;
226	}
227	break;
228	/ WH becomes W,*
229	WR becomes R
230	W if followed by a vowel /*
231	case `'W'`:
232	if (Next_Letter == `'R'`) {
233	Phonize(Next_Letter);
234	w_idx += `2`;
235	} else if (Next_Letter == `'H'` \|\| isvowel(Next_Letter)) {
236	Phonize(`'W'`);
237	w_idx += `2`;
238	}
239	/ else ignore /
240	break;
241	/ X becomes S /
242	case `'X'`:
243	Phonize(`'S'`);
244	w_idx++;
245	break;
246	/ Vowels are kept /
247	/ We did A already*
248	case 'A':
249	case 'a':
250	*/
251	case `'E'`:
252	case `'I'`:
253	case `'O'`:
254	case `'U'`:
255	Phonize(Curr_Letter);
256	w_idx++;
257	break;
258	default:
259	/ do nothing /
260	break;
261	}
262
263
264
265	/ On to the metaphoning /
266	for (; Curr_Letter != `'\0'` &&
267	(max_phonemes == `0` \|\| Phone_Len < max_phonemes);
268	w_idx++) {
269	/ How many letters to skip because an eariler encoding handled*
270	* multiple letters */
271	unsigned short int skip_letter = `0`;
272
273
274	/ THOUGHT: It would be nice if, rather than having things like...*
275	* well, SCI. For SCI you encode the S, then have to remember
276	* to skip the C. So the phonome SCI invades both S and C. It would
277	* be better, IMHO, to skip the C from the S part of the encoding.
278	* Hell, I'm trying it.
279	*/
280
281	/ Ignore non-alphas /
282	if (!isalpha(Curr_Letter))
283	continue;
284
285	/ Drop duplicates, except CC /
286	if (Curr_Letter == Prev_Letter &&
287	Curr_Letter != `'C'`)
288	continue;
289
290	switch (Curr_Letter) {
291	/ B -> B unless in MB /
292	case `'B'`:
293	if (Prev_Letter != `'M'`)
294	Phonize(`'B'`);
295	break;
296	/ 'sh' if -CIA- or -CH, but not SCH, except SCHW.*
297	* (SCHW is handled in S)
298	* S if -CI-, -CE- or -CY-
299	* dropped if -SCI-, SCE-, -SCY- (handed in S)
300	* else K
301	*/
302	case `'C'`:
303	if (MAKESOFT(Next_Letter)) { / C[IEY] /
304	if (After_Next_Letter == `'A'` &&
305	Next_Letter == `'I'`) { / CIA /
306	Phonize(SH);
307	}
308	/ SC[IEY] /
309	else if (Prev_Letter == `'S'`) {
310	/ Dropped /
311	} else {
312	Phonize(`'S'`);
313	}
314	} else if (Next_Letter == `'H'`) {
315	if ((!traditional) && (After_Next_Letter == `'R'` \|\| Prev_Letter == `'S'`)) { / Christ, School /
316	Phonize(`'K'`);
317	} else {
318	Phonize(SH);
319	}
320	skip_letter++;
321	} else {
322	Phonize(`'K'`);
323	}
324	break;
325	/ J if in -DGE-, -DGI- or -DGY-*
326	* else T
327	*/
328	case `'D'`:
329	if (Next_Letter == `'G'` &&
330	MAKESOFT(After_Next_Letter)) {
331	Phonize(`'J'`);
332	skip_letter++;
333	} else
334	Phonize(`'T'`);
335	break;
336	/ F if in -GH and not B--GH, D--GH, -H--GH, -H---GH*
337	* else dropped if -GNED, -GN,
338	* else dropped if -DGE-, -DGI- or -DGY- (handled in D)
339	* else J if in -GE-, -GI, -GY and not GG
340	* else K
341	*/
342	case `'G'`:
343	if (Next_Letter == `'H'`) {
344	if (!(NOGHTOF(Look_Back_Letter(`3`)) \|\|
345	Look_Back_Letter(`4`) == `'H'`)) {
346	Phonize(`'F'`);
347	skip_letter++;
348	} else {
349	/ silent /
350	}
351	} else if (Next_Letter == `'N'`) {
352	if (Isbreak(After_Next_Letter) \|\|
353	(After_Next_Letter == `'E'` &&
354	Look_Ahead_Letter(`3`) == `'D'`)) {
355	/ dropped /
356	} else
357	Phonize(`'K'`);
358	} else if (MAKESOFT(Next_Letter) &&
359	Prev_Letter != `'G'`) {
360	Phonize(`'J'`);
361	} else {
362	Phonize(`'K'`);
363	}
364	break;
365	/ H if before a vowel and not after C,G,P,S,T /
366	case `'H'`:
367	if (isvowel(Next_Letter) &&
368	!AFFECTH(Prev_Letter))
369	Phonize(`'H'`);
370	break;
371	/ dropped if after C*
372	* else K
373	*/
374	case `'K'`:
375	if (Prev_Letter != `'C'`)
376	Phonize(`'K'`);
377	break;
378	/ F if before H*
379	* else P
380	*/
381	case `'P'`:
382	if (Next_Letter == `'H'`) {
383	Phonize(`'F'`);
384	} else {
385	Phonize(`'P'`);
386	}
387	break;
388	/ K*
389	*/
390	case `'Q'`:
391	Phonize(`'K'`);
392	break;
393	/ 'sh' in -SH-, -SIO- or -SIA- or -SCHW-*
394	* else S
395	*/
396	case `'S'`:
397	if (Next_Letter == `'I'` &&
398	(After_Next_Letter == `'O'` \|\|
399	After_Next_Letter == `'A'`)) {
400	Phonize(SH);
401	} else if (Next_Letter == `'H'`) {
402	Phonize(SH);
403	skip_letter++;
404	} else if ((!traditional) && (Next_Letter == `'C'` && Look_Ahead_Letter(`2`) == `'H'` && Look_Ahead_Letter(`3`) == `'W'`)) {
405	Phonize(SH);
406	skip_letter += `2`;
407	} else {
408	Phonize(`'S'`);
409	}
410	break;
411	/ 'sh' in -TIA- or -TIO-*
412	* else 'th' before H
413	* else T
414	*/
415	case `'T'`:
416	if (Next_Letter == `'I'` &&
417	(After_Next_Letter == `'O'` \|\|
418	After_Next_Letter == `'A'`)) {
419	Phonize(SH);
420	} else if (Next_Letter == `'H'`) {
421	Phonize(TH);
422	skip_letter++;
423	} else if (!(Next_Letter == `'C'` && After_Next_Letter == `'H'`)) {
424	Phonize(`'T'`);
425	}
426	break;
427	/ F /
428	case `'V'`:
429	Phonize(`'F'`);
430	break;
431	/ W before a vowel, else dropped /
432	case `'W'`:
433	if (isvowel(Next_Letter))
434	Phonize(`'W'`);
435	break;
436	/ KS /
437	case `'X'`:
438	Phonize(`'K'`);
439	Phonize(`'S'`);
440	break;
441	/ Y if followed by a vowel /
442	case `'Y'`:
443	if (isvowel(Next_Letter))
444	Phonize(`'Y'`);
445	break;
446	/ S /
447	case `'Z'`:
448	Phonize(`'S'`);
449	break;
450	/ No transformation /
451	case `'F'`:
452	case `'J'`:
453	case `'L'`:
454	case `'M'`:
455	case `'N'`:
456	case `'R'`:
457	Phonize(Curr_Letter);
458	break;
459	default:
460	/ nothing /
461	break;
462	} / END SWITCH /
463
464	w_idx += skip_letter;
465	} / END FOR /
466
467	End_Phoned_Word;
468
469	return `0`;
470	} / END metaphone /
471	/ }}} /
472
473	/*
474	* Local variables:
475	* tab-width: 4
476	* c-basic-offset: 4
477	* End:
478	* vim600: sw=4 ts=4 fdm=marker
479	* vim<600: sw=4 ts=4
480	*/
481

Browse the source code of php/ext/standard/metaphone.c