html.c source code [php/ext/standard/html.c]

1	/*
2	+----------------------------------------------------------------------+
3	\| PHP Version 5 \|
4	+----------------------------------------------------------------------+
5	\| Copyright (c) 1997-2015 The PHP Group \|
6	+----------------------------------------------------------------------+
7	\| This source file is subject to version 3.01 of the PHP license, \|
8	\| that is bundled with this package in the file LICENSE, and is \|
9	\| available through the world-wide-web at the following url: \|
10	\| http://www.php.net/license/3_01.txt \|
11	\| If you did not receive a copy of the PHP license and are unable to \|
12	\| obtain it through the world-wide-web, please send a note to \|
13	\| license@php.net so we can mail you a copy immediately. \|
14	+----------------------------------------------------------------------+
15	\| Authors: Rasmus Lerdorf <rasmus@php.net> \|
16	\| Jaakko Hyvätti <jaakko.hyvatti@iki.fi> \|
17	\| Wez Furlong <wez@thebrainroom.com> \|
18	\| Gustavo Lopes <cataphract@php.net> \|
19	+----------------------------------------------------------------------+
20	*/
21
/* $Id$ */
23
24	/*
25	* HTML entity resources:
26	*
27	* http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT
28	*
29	* XHTML 1.0 DTD
30	* http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2
31	*
32	* From HTML 4.01 strict DTD:
33	* http://www.w3.org/TR/html4/HTMLlat1.ent
34	* http://www.w3.org/TR/html4/HTMLsymbol.ent
35	* http://www.w3.org/TR/html4/HTMLspecial.ent
36	*
37	* HTML 5:
38	* http://dev.w3.org/html5/spec/Overview.html#named-character-references
39	*/
40
41	#include "php.h"
42	#if PHP_WIN32
43	#include "config.w32.h"
44	#else
45	#include <php_config.h>
46	#endif
47	#include "php_standard.h"
48	#include "php_string.h"
49	#include "SAPI.h"
50	#if HAVE_LOCALE_H
51	#include <locale.h>
52	#endif
53	#if HAVE_LANGINFO_H
54	#include <langinfo.h>
55	#endif
56
57	#include <zend_hash.h>
58	#include "html_tables.h"
59
/* Macro for disabling flag of translation of non-basic entities where this isn't supported.
 * Not appropriate for html_entity_decode/htmlspecialchars_decode.
 *
 * Clears `all` (an lvalue) when the charset only has partial support or when
 * the document type is XML 1.0; otherwise leaves it unchanged. */
#define LIMIT_ALL(all, doctype, charset) do { \
	(all) = (all) && !CHARSET_PARTIAL_SUPPORT((charset)) && ((doctype) != ENT_HTML_DOC_XML1); \
} while (0)
65
/* Report an invalid multibyte sequence: move the caller's cursor to
 * pos + advance, flag the failure and return 0 from the enclosing function.
 * Expects `cursor` (size_t *) and `status` (int *) to be in scope at the
 * expansion site. */
#define MB_FAILURE(pos, advance) do { \
	*cursor = (pos) + (advance); \
	*status = FAILURE; \
	return 0; \
} while (0)

/* True when at least chars_need bytes are available starting at offset pos.
 * Expects `str_len` to be in scope and pos <= str_len (unsigned subtraction). */
#define CHECK_LEN(pos, chars_need) ((str_len - (pos)) >= (chars_need))
73
/* valid as single byte character or leading byte */
#define utf8_lead(c)  ((c) < 0x80 || ((c) >= 0xC2 && (c) <= 0xF4))
/* whether it's actually valid depends on other stuff;
 * this macro cannot check for non-shortest forms, surrogates or
 * code points above 0x10FFFF */
#define utf8_trail(c) ((c) >= 0x80 && (c) <= 0xBF)

/* GB2312 (EUC-CN): byte values accepted as a lead / trail byte */
#define gb2312_lead(c) ((c) != 0x8E && (c) != 0x8F && (c) != 0xA0 && (c) != 0xFF)
#define gb2312_trail(c) ((c) >= 0xA1 && (c) <= 0xFE)

/* Shift JIS: byte values accepted as a lead / trail byte */
#define sjis_lead(c) ((c) != 0x80 && (c) != 0xA0 && (c) < 0xFD)
#define sjis_trail(c) ((c) >= 0x40 && (c) != 0x7F && (c) < 0xFD)
86
87	/ {{{ get_default_charset*
88	*/
89	static char *get_default_charset(TSRMLS_D) {
90	if (PG(internal_encoding) && PG(internal_encoding)[`0`]) {
91	return PG(internal_encoding);
92	} else if (SG(default_charset) && SG(default_charset)[`0`] ) {
93	return SG(default_charset);
94	}
95	return NULL;
96	}
97	/ }}} /
98
99	/ {{{ get_next_char*
100	*/
101	static inline unsigned int get_next_char(
102	enum entity_charset charset,
103	const unsigned char *str,
104	size_t str_len,
105	size_t *cursor,
106	int *status)
107	{
108	size_t pos = *cursor;
109	unsigned int this_char = `0`;
110
111	*status = SUCCESS;
112	assert(pos <= str_len);
113
114	if (!CHECK_LEN(pos, `1`))
115	MB_FAILURE(pos, `1`);
116
117	switch (charset) {
118	case cs_utf_8:
119	{
120	/ We'll follow strategy 2. from section 3.6.1 of UTR #36:*
121	* "In a reported illegal byte sequence, do not include any
122	* non-initial byte that encodes a valid character or is a leading
123	* byte for a valid sequence." */
124	unsigned char c;
125	c = str[pos];
126	if (c < `0x80`) {
127	this_char = c;
128	pos++;
129	} else if (c < `0xc2`) {
130	MB_FAILURE(pos, `1`);
131	} else if (c < `0xe0`) {
132	if (!CHECK_LEN(pos, `2`))
133	MB_FAILURE(pos, `1`);
134
135	if (!utf8_trail(str[pos + `1`])) {
136	MB_FAILURE(pos, utf8_lead(str[pos + `1`]) ? `1` : `2`);
137	}
138	this_char = ((c & `0x1f`) << `6`) \| (str[pos + `1`] & `0x3f`);
139	if (this_char < `0x80`) { / non-shortest form /
140	MB_FAILURE(pos, `2`);
141	}
142	pos += `2`;
143	} else if (c < `0xf0`) {
144	size_t avail = str_len - pos;
145
146	if (avail < `3` \|\|
147	!utf8_trail(str[pos + `1`]) \|\| !utf8_trail(str[pos + `2`])) {
148	if (avail < `2` \|\| utf8_lead(str[pos + `1`]))
149	MB_FAILURE(pos, `1`);
150	else if (avail < `3` \|\| utf8_lead(str[pos + `2`]))
151	MB_FAILURE(pos, `2`);
152	else
153	MB_FAILURE(pos, `3`);
154	}
155
156	this_char = ((c & `0x0f`) << `12`) \| ((str[pos + `1`] & `0x3f`) << `6`) \| (str[pos + `2`] & `0x3f`);
157	if (this_char < `0x800`) { / non-shortest form /
158	MB_FAILURE(pos, `3`);
159	} else if (this_char >= `0xd800` && this_char <= `0xdfff`) { / surrogate /
160	MB_FAILURE(pos, `3`);
161	}
162	pos += `3`;
163	} else if (c < `0xf5`) {
164	size_t avail = str_len - pos;
165
166	if (avail < `4` \|\|
167	!utf8_trail(str[pos + `1`]) \|\| !utf8_trail(str[pos + `2`]) \|\|
168	!utf8_trail(str[pos + `3`])) {
169	if (avail < `2` \|\| utf8_lead(str[pos + `1`]))
170	MB_FAILURE(pos, `1`);
171	else if (avail < `3` \|\| utf8_lead(str[pos + `2`]))
172	MB_FAILURE(pos, `2`);
173	else if (avail < `4` \|\| utf8_lead(str[pos + `3`]))
174	MB_FAILURE(pos, `3`);
175	else
176	MB_FAILURE(pos, `4`);
177	}
178
179	this_char = ((c & `0x07`) << `18`) \| ((str[pos + `1`] & `0x3f`) << `12`) \| ((str[pos + `2`] & `0x3f`) << `6`) \| (str[pos + `3`] & `0x3f`);
180	if (this_char < `0x10000` \|\| this_char > `0x10FFFF`) { / non-shortest form or outside range /
181	MB_FAILURE(pos, `4`);
182	}
183	pos += `4`;
184	} else {
185	MB_FAILURE(pos, `1`);
186	}
187	}
188	break;
189
190	case cs_big5:
191	/ reference http://demo.icu-project.org/icu-bin/convexp?conv=big5 /
192	{
193	unsigned char c = str[pos];
194	if (c >= `0x81` && c <= `0xFE`) {
195	unsigned char next;
196	if (!CHECK_LEN(pos, `2`))
197	MB_FAILURE(pos, `1`);
198
199	next = str[pos + `1`];
200
201	if ((next >= `0x40` && next <= `0x7E`) \|\|
202	(next >= `0xA1` && next <= `0xFE`)) {
203	this_char = (c << `8`) \| next;
204	} else {
205	MB_FAILURE(pos, `1`);
206	}
207	pos += `2`;
208	} else {
209	this_char = c;
210	pos += `1`;
211	}
212	}
213	break;
214
215	case cs_big5hkscs:
216	{
217	unsigned char c = str[pos];
218	if (c >= `0x81` && c <= `0xFE`) {
219	unsigned char next;
220	if (!CHECK_LEN(pos, `2`))
221	MB_FAILURE(pos, `1`);
222
223	next = str[pos + `1`];
224
225	if ((next >= `0x40` && next <= `0x7E`) \|\|
226	(next >= `0xA1` && next <= `0xFE`)) {
227	this_char = (c << `8`) \| next;
228	} else if (next != `0x80` && next != `0xFF`) {
229	MB_FAILURE(pos, `1`);
230	} else {
231	MB_FAILURE(pos, `2`);
232	}
233	pos += `2`;
234	} else {
235	this_char = c;
236	pos += `1`;
237	}
238	}
239	break;
240
241	case cs_gb2312: / EUC-CN /
242	{
243	unsigned char c = str[pos];
244	if (c >= `0xA1` && c <= `0xFE`) {
245	unsigned char next;
246	if (!CHECK_LEN(pos, `2`))
247	MB_FAILURE(pos, `1`);
248
249	next = str[pos + `1`];
250
251	if (gb2312_trail(next)) {
252	this_char = (c << `8`) \| next;
253	} else if (gb2312_lead(next)) {
254	MB_FAILURE(pos, `1`);
255	} else {
256	MB_FAILURE(pos, `2`);
257	}
258	pos += `2`;
259	} else if (gb2312_lead(c)) {
260	this_char = c;
261	pos += `1`;
262	} else {
263	MB_FAILURE(pos, `1`);
264	}
265	}
266	break;
267
268	case cs_sjis:
269	{
270	unsigned char c = str[pos];
271	if ((c >= `0x81` && c <= `0x9F`) \|\| (c >= `0xE0` && c <= `0xFC`)) {
272	unsigned char next;
273	if (!CHECK_LEN(pos, `2`))
274	MB_FAILURE(pos, `1`);
275
276	next = str[pos + `1`];
277
278	if (sjis_trail(next)) {
279	this_char = (c << `8`) \| next;
280	} else if (sjis_lead(next)) {
281	MB_FAILURE(pos, `1`);
282	} else {
283	MB_FAILURE(pos, `2`);
284	}
285	pos += `2`;
286	} else if (c < `0x80` \|\| (c >= `0xA1` && c <= `0xDF`)) {
287	this_char = c;
288	pos += `1`;
289	} else {
290	MB_FAILURE(pos, `1`);
291	}
292	}
293	break;
294
295	case cs_eucjp:
296	{
297	unsigned char c = str[pos];
298
299	if (c >= `0xA1` && c <= `0xFE`) {
300	unsigned next;
301	if (!CHECK_LEN(pos, `2`))
302	MB_FAILURE(pos, `1`);
303	next = str[pos + `1`];
304
305	if (next >= `0xA1` && next <= `0xFE`) {
306	/ this a jis kanji char /
307	this_char = (c << `8`) \| next;
308	} else {
309	MB_FAILURE(pos, (next != `0xA0` && next != `0xFF`) ? `1` : `2`);
310	}
311	pos += `2`;
312	} else if (c == `0x8E`) {
313	unsigned next;
314	if (!CHECK_LEN(pos, `2`))
315	MB_FAILURE(pos, `1`);
316
317	next = str[pos + `1`];
318	if (next >= `0xA1` && next <= `0xDF`) {
319	/ JIS X 0201 kana /
320	this_char = (c << `8`) \| next;
321	} else {
322	MB_FAILURE(pos, (next != `0xA0` && next != `0xFF`) ? `1` : `2`);
323	}
324	pos += `2`;
325	} else if (c == `0x8F`) {
326	size_t avail = str_len - pos;
327
328	if (avail < `3` \|\| !(str[pos + `1`] >= `0xA1` && str[pos + `1`] <= `0xFE`) \|\|
329	!(str[pos + `2`] >= `0xA1` && str[pos + `2`] <= `0xFE`)) {
330	if (avail < `2` \|\| (str[pos + `1`] != `0xA0` && str[pos + `1`] != `0xFF`))
331	MB_FAILURE(pos, `1`);
332	else if (avail < `3` \|\| (str[pos + `2`] != `0xA0` && str[pos + `2`] != `0xFF`))
333	MB_FAILURE(pos, `2`);
334	else
335	MB_FAILURE(pos, `3`);
336	} else {
337	/ JIS X 0212 hojo-kanji /
338	this_char = (c << `16`) \| (str[pos + `1`] << `8`) \| str[pos + `2`];
339	}
340	pos += `3`;
341	} else if (c != `0xA0` && c != `0xFF`) {
342	/ character encoded in 1 code unit /
343	this_char = c;
344	pos += `1`;
345	} else {
346	MB_FAILURE(pos, `1`);
347	}
348	}
349	break;
350	default:
351	/ single-byte charsets /
352	this_char = str[pos++];
353	break;
354	}
355
356	*cursor = pos;
357	return this_char;
358	}
359	/ }}} /
360
361	/ {{{ php_next_utf8_char*
362	* Public interface for get_next_char used with UTF-8 */
363	PHPAPI unsigned int php_next_utf8_char(
364	const unsigned char *str,
365	size_t str_len,
366	size_t *cursor,
367	int *status)
368	{
369	return get_next_char(cs_utf_8, str, str_len, cursor, status);
370	}
371	/ }}} /
372
373	/ {{{ entity_charset determine_charset*
374	* returns the charset identifier based on current locale or a hint.
375	* defaults to UTF-8 */
376	static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC)
377	{
378	int i;
379	enum entity_charset charset = cs_utf_8;
380	int len = `0`;
381	const zend_encoding *zenc;
382
383	/ Default is now UTF-8 /
384	if (charset_hint == NULL)
385	return cs_utf_8;
386
387	if ((len = strlen(charset_hint)) != `0`) {
388	goto det_charset;
389	}
390
391	zenc = zend_multibyte_get_internal_encoding(TSRMLS_C);
392	if (zenc != NULL) {
393	charset_hint = (char *)zend_multibyte_get_encoding_name(zenc);
394	if (charset_hint != NULL && (len=strlen(charset_hint)) != `0`) {
395	if ((len == `4`) / sizeof (none\|auto\|pass) / &&
396	(!memcmp("pass", charset_hint, `4`) \|\|
397	!memcmp("auto", charset_hint, `4`) \|\|
398	!memcmp("none", charset_hint, `4`))) {
399	charset_hint = NULL;
400	len = `0`;
401	} else {
402	goto det_charset;
403	}
404	}
405	}
406
407	charset_hint = SG(default_charset);
408	if (charset_hint != NULL && (len=strlen(charset_hint)) != `0`) {
409	goto det_charset;
410	}
411
412	/ try to detect the charset for the locale /
413	#if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET)
414	charset_hint = nl_langinfo(CODESET);
415	if (charset_hint != NULL && (len=strlen(charset_hint)) != `0`) {
416	goto det_charset;
417	}
418	#endif
419
420	#if HAVE_LOCALE_H
421	/ try to figure out the charset from the locale /
422	{
423	char *localename;
424	char dot, at;
425
426	/ lang[_territory][.codeset][@modifier] /
427	localename = setlocale(LC_CTYPE, NULL);
428
429	dot = strchr(localename, `'.'`);
430	if (dot) {
431	dot++;
432	/ locale specifies a codeset /
433	at = strchr(dot, `'@'`);
434	if (at)
435	len = at - dot;
436	else
437	len = strlen(dot);
438	charset_hint = dot;
439	} else {
440	/ no explicit name; see if the name itself*
441	* is the charset */
442	charset_hint = localename;
443	len = strlen(charset_hint);
444	}
445	}
446	#endif
447
448	det_charset:
449
450	if (charset_hint) {
451	int found = `0`;
452
453	/ now walk the charset map and look for the codeset /
454	for (i = `0`; charset_map[i].codeset; i++) {
455	if (len == strlen(charset_map[i].codeset) && strncasecmp(charset_hint, charset_map[i].codeset, len) == `0`) {
456	charset = charset_map[i].charset;
457	found = `1`;
458	break;
459	}
460	}
461	if (!found) {
462	php_error_docref(NULL TSRMLS_CC, E_WARNING, "charset `%s' not supported, assuming utf-8",
463	charset_hint);
464	}
465	}
466	return charset;
467	}
468	/ }}} /
469
/* {{{ php_utf32_utf8
 * Encodes code point k as UTF-8 into buf and returns the number of bytes
 * written (1 to 4). buf must have room for 4 bytes; no terminator is added.
 * k is not validated here. */
static inline size_t php_utf32_utf8(unsigned char *buf, unsigned k)
{
	size_t retval = 0;

	/* assert(0x0 <= k <= 0x10FFFF); */

	if (k < 0x80) {
		buf[0] = (unsigned char)k;
		retval = 1;
	} else if (k < 0x800) {
		buf[0] = (unsigned char)(0xc0 | (k >> 6));
		buf[1] = (unsigned char)(0x80 | (k & 0x3f));
		retval = 2;
	} else if (k < 0x10000) {
		buf[0] = (unsigned char)(0xe0 | (k >> 12));
		buf[1] = (unsigned char)(0x80 | ((k >> 6) & 0x3f));
		buf[2] = (unsigned char)(0x80 | (k & 0x3f));
		retval = 3;
	} else {
		buf[0] = (unsigned char)(0xf0 | (k >> 18));
		buf[1] = (unsigned char)(0x80 | ((k >> 12) & 0x3f));
		buf[2] = (unsigned char)(0x80 | ((k >> 6) & 0x3f));
		buf[3] = (unsigned char)(0x80 | (k & 0x3f));
		retval = 4;
	}
	/* UTF-8 has been restricted to max 4 bytes since RFC 3629 */

	return retval;
}
/* }}} */
501
/* {{{ php_mb2_int_to_char
 * Convert back big endian int representation of sequence of one or two 8-bit code units.
 * Writes the bytes to buf and returns how many were written (1 or 2). */
static inline size_t php_mb2_int_to_char(unsigned char *buf, unsigned k)
{
	assert(k <= 0xFFFFU);
	/* one or two bytes */
	if (k <= 0xFFU) { /* 1 */
		buf[0] = (unsigned char)k;
		return 1U;
	} else { /* 2 */
		buf[0] = (unsigned char)(k >> 8);
		buf[1] = (unsigned char)(k & 0xFFU);
		return 2U;
	}
}
/* }}} */
518
/* {{{ php_mb3_int_to_char
 * Convert back big endian int representation of sequence of one to three 8-bit code units.
 * For EUC-JP. Writes the bytes to buf and returns how many were written (1 to 3). */
static inline size_t php_mb3_int_to_char(unsigned char *buf, unsigned k)
{
	assert(k <= 0xFFFFFFU);
	/* one to three bytes */
	if (k <= 0xFFU) { /* 1 */
		buf[0] = (unsigned char)k;
		return 1U;
	} else if (k <= 0xFFFFU) { /* 2 */
		buf[0] = (unsigned char)(k >> 8);
		buf[1] = (unsigned char)(k & 0xFFU);
		return 2U;
	} else { /* 3 */
		buf[0] = (unsigned char)(k >> 16);
		buf[1] = (unsigned char)((k >> 8) & 0xFFU);
		buf[2] = (unsigned char)(k & 0xFFU);
		return 3U;
	}
}
/* }}} */
541
542
543	/ {{{ unimap_bsearc_cmp*
544	* Binary search of unicode code points in unicode <--> charset mapping.
545	* Returns the code point in the target charset (whose mapping table was given) or 0 if
546	* the unicode code point is not in the table.
547	*/
548	static inline unsigned char unimap_bsearch(const uni_to_enc table, unsigned* code_key_a, size_t num)
549	{
550	const uni_to_enc *l = table,
551	*h = &table[num-`1`],
552	*m;
553	unsigned short code_key;
554
555	/ we have no mappings outside the BMP /
556	if (code_key_a > `0xFFFFU`)
557	return `0`;
558
559	code_key = (unsigned short) code_key_a;
560
561	while (l <= h) {
562	m = l + (h - l) / `2`;
563	if (code_key < m->un_code_point)
564	h = m - `1`;
565	else if (code_key > m->un_code_point)
566	l = m + `1`;
567	else
568	return m->cs_code;
569	}
570	return `0`;
571	}
572	/ }}} /
573
574	/ {{{ map_from_unicode /
575	static inline int map_from_unicode(unsigned code, enum entity_charset charset, unsigned *res)
576	{
577	unsigned char found;
578	const uni_to_enc *table;
579	size_t table_size;
580
581	switch (charset) {
582	case cs_8859_1:
583	/ identity mapping of code points to unicode /
584	if (code > `0xFF`) {
585	return FAILURE;
586	}
587	*res = code;
588	break;
589
590	case cs_8859_5:
591	if (code <= `0xA0` \|\| code == `0xAD` / soft hyphen /) {
592	*res = code;
593	} else if (code == `0x2116`) {
594	res = `0xF0`; /* numero sign /
595	} else if (code == `0xA7`) {
596	res = `0xFD`; /* section sign /
597	} else if (code >= `0x0401` && code <= `0x044F`) {
598	if (code == `0x040D` \|\| code == `0x0450` \|\| code == `0x045D`)
599	return FAILURE;
600	*res = code - `0x360`;
601	} else {
602	return FAILURE;
603	}
604	break;
605
606	case cs_8859_15:
607	if (code < `0xA4` \|\| (code > `0xBE` && code <= `0xFF`)) {
608	*res = code;
609	} else { / between A4 and 0xBE /
610	found = unimap_bsearch(unimap_iso885915,
611	code, sizeof(unimap_iso885915) / sizeof(*unimap_iso885915));
612	if (found)
613	*res = found;
614	else
615	return FAILURE;
616	}
617	break;
618
619	case cs_cp1252:
620	if (code <= `0x7F` \|\| (code >= `0xA0` && code <= `0xFF`)) {
621	*res = code;
622	} else {
623	found = unimap_bsearch(unimap_win1252,
624	code, sizeof(unimap_win1252) / sizeof(*unimap_win1252));
625	if (found)
626	*res = found;
627	else
628	return FAILURE;
629	}
630	break;
631
632	case cs_macroman:
633	if (code == `0x7F`)
634	return FAILURE;
635	table = unimap_macroman;
636	table_size = sizeof(unimap_macroman) / sizeof(*unimap_macroman);
637	goto table_over_7F;
638	case cs_cp1251:
639	table = unimap_win1251;
640	table_size = sizeof(unimap_win1251) / sizeof(*unimap_win1251);
641	goto table_over_7F;
642	case cs_koi8r:
643	table = unimap_koi8r;
644	table_size = sizeof(unimap_koi8r) / sizeof(*unimap_koi8r);
645	goto table_over_7F;
646	case cs_cp866:
647	table = unimap_cp866;
648	table_size = sizeof(unimap_cp866) / sizeof(*unimap_cp866);
649
650	table_over_7F:
651	if (code <= `0x7F`) {
652	*res = code;
653	} else {
654	found = unimap_bsearch(table, code, table_size);
655	if (found)
656	*res = found;
657	else
658	return FAILURE;
659	}
660	break;
661
662	/ from here on, only map the possible characters in the ASCII range.*
663	* to improve support here, it's a matter of building the unicode mappings.
664	* See <http://www.unicode.org/Public/6.0.0/ucd/Unihan.zip> */
665	case cs_sjis:
666	case cs_eucjp:
667	/ we interpret 0x5C as the Yen symbol. This is not universal.*
668	* See <http://www.w3.org/Submission/japanese-xml/#ambiguity_of_yen> */
669	if (code >= `0x20` && code <= `0x7D`) {
670	if (code == `0x5C`)
671	return FAILURE;
672	*res = code;
673	} else {
674	return FAILURE;
675	}
676	break;
677
678	case cs_big5:
679	case cs_big5hkscs:
680	case cs_gb2312:
681	if (code >= `0x20` && code <= `0x7D`) {
682	*res = code;
683	} else {
684	return FAILURE;
685	}
686	break;
687
688	default:
689	return FAILURE;
690	}
691
692	return SUCCESS;
693	}
694	/ }}} /
695
696	/ {{{ /
697	static inline void map_to_unicode(unsigned code, const enc_to_uni table, unsigned* *res)
698	{
699	/ only single byte encodings are currently supported; assumed code <= 0xFF /
700	*res = table->inner[ENT_ENC_TO_UNI_STAGE1(code)]->uni_cp[ENT_ENC_TO_UNI_STAGE2(code)];
701	}
702	/ }}} /
703
704	/ {{{ unicode_cp_is_allowed /
705	static inline int unicode_cp_is_allowed(unsigned uni_cp, int document_type)
706	{
707	/ XML 1.0 HTML 4.01 HTML 5*
708	* 0x09..0x0A 0x09..0x0A 0x09..0x0A
709	* 0x0D 0x0D 0x0C..0x0D
710	* 0x0020..0xD7FF 0x20..0x7E 0x20..0x7E
711	* 0x00A0..0xD7FF 0x00A0..0xD7FF
712	* 0xE000..0xFFFD 0xE000..0x10FFFF 0xE000..0xFDCF
713	* 0x010000..0x10FFFF 0xFDF0..0x10FFFF (*)
714	*
715	* (*) exclude code points where ((code & 0xFFFF) >= 0xFFFE)
716	*
717	* References:
718	* XML 1.0: <http://www.w3.org/TR/REC-xml/#charsets>
719	* HTML 4.01: <http://www.w3.org/TR/1999/PR-html40-19990824/sgml/sgmldecl.html>
720	* HTML 5: <http://dev.w3.org/html5/spec/Overview.html#preprocessing-the-input-stream>
721	*
722	* Not sure this is the relevant part for HTML 5, though. I opted to
723	* disallow the characters that would result in a parse error when
724	* preprocessing of the input stream. See also section 8.1.3.
725	*
726	* It's unclear if XHTML 1.0 allows C1 characters. I'll opt to apply to
727	* XHTML 1.0 the same rules as for XML 1.0.
728	* See <http://cmsmcq.com/2007/C1.xml>.
729	*/
730
731	switch (document_type) {
732	case ENT_HTML_DOC_HTML401:
733	return (uni_cp >= `0x20` && uni_cp <= `0x7E`) \|\|
734	(uni_cp == `0x0A` \|\| uni_cp == `0x09` \|\| uni_cp == `0x0D`) \|\|
735	(uni_cp >= `0xA0` && uni_cp <= `0xD7FF`) \|\|
736	(uni_cp >= `0xE000` && uni_cp <= `0x10FFFF`);
737	case ENT_HTML_DOC_HTML5:
738	return (uni_cp >= `0x20` && uni_cp <= `0x7E`) \|\|
739	(uni_cp >= `0x09` && uni_cp <= `0x0D` && uni_cp != `0x0B`) \|\| / form feed U+0C allowed /
740	(uni_cp >= `0xA0` && uni_cp <= `0xD7FF`) \|\|
741	(uni_cp >= `0xE000` && uni_cp <= `0x10FFFF` &&
742	((uni_cp & `0xFFFF`) < `0xFFFE`) && / last two of each plane (nonchars) disallowed /
743	(uni_cp < `0xFDD0` \|\| uni_cp > `0xFDEF`)); / U+FDD0-U+FDEF (nonchars) disallowed /
744	case ENT_HTML_DOC_XHTML:
745	case ENT_HTML_DOC_XML1:
746	return (uni_cp >= `0x20` && uni_cp <= `0xD7FF`) \|\|
747	(uni_cp == `0x0A` \|\| uni_cp == `0x09` \|\| uni_cp == `0x0D`) \|\|
748	(uni_cp >= `0xE000` && uni_cp <= `0x10FFFF` && uni_cp != `0xFFFE` && uni_cp != `0xFFFF`);
749	default:
750	return `1`;
751	}
752	}
753	/ }}} /
754
755	/ {{{ unicode_cp_is_allowed /
756	static inline int numeric_entity_is_allowed(unsigned uni_cp, int document_type)
757	{
758	/ less restrictive than unicode_cp_is_allowed /
759	switch (document_type) {
760	case ENT_HTML_DOC_HTML401:
761	/ all non-SGML characters (those marked with UNUSED in DESCSET) should be*
762	* representable with numeric entities */
763	return uni_cp <= `0x10FFFF`;
764	case ENT_HTML_DOC_HTML5:
765	/ 8.1.4. The numeric character reference forms described above are allowed to*
766	* reference any Unicode code point other than U+0000, U+000D, permanently
767	* undefined Unicode characters (noncharacters), and control characters other
768	* than space characters (U+0009, U+000A, U+000C and U+000D) */
769	/ seems to allow surrogate characters, then /
770	return (uni_cp >= `0x20` && uni_cp <= `0x7E`) \|\|
771	(uni_cp >= `0x09` && uni_cp <= `0x0C` && uni_cp != `0x0B`) \|\| / form feed U+0C allowed, but not U+0D /
772	(uni_cp >= `0xA0` && uni_cp <= `0x10FFFF` &&
773	((uni_cp & `0xFFFF`) < `0xFFFE`) && / last two of each plane (nonchars) disallowed /
774	(uni_cp < `0xFDD0` \|\| uni_cp > `0xFDEF`)); / U+FDD0-U+FDEF (nonchars) disallowed /
775	case ENT_HTML_DOC_XHTML:
776	case ENT_HTML_DOC_XML1:
777	/ OTOH, XML 1.0 requires "character references to match the production for Char*
778	* See <http://www.w3.org/TR/REC-xml/#NT-CharRef> */
779	return unicode_cp_is_allowed(uni_cp, document_type);
780	default:
781	return `1`;
782	}
783	}
784	/ }}} /
785
786	/ {{{ process_numeric_entity*
787	* Auxiliary function to traverse_for_entities.
788	* On input, *buf should point to the first character after # and on output, it's the last
789	* byte read, no matter if there was success or insuccess.
790	*/
791	static inline int process_numeric_entity(const char *buf, unsigned* *code_point)
792	{
793	long code_l;
794	int hexadecimal = (buf == `'x'` \|\| buf == `'X'`); / TODO: XML apparently disallows "X" /
795	char *endptr;
796
797	if (hexadecimal && (**buf != `'\0'`))
798	(*buf)++;
799
800	/ strtol allows whitespace and other stuff in the beginning*
801	* we're not interested */
802	if ((hexadecimal && !isxdigit(**buf)) \|\|
803	(!hexadecimal && !isdigit(**buf))) {
804	return FAILURE;
805	}
806
807	code_l = strtol(*buf, &endptr, hexadecimal ? `16` : `10`);
808	/ we're guaranteed there were valid digits, so endptr > buf /*
809	*buf = endptr;
810
811	if (**buf != `';'`)
812	return FAILURE;
813
814	/ many more are invalid, but that depends on whether it's HTML*
815	* (and which version) or XML. */
816	if (code_l > `0x10FFFFL`)
817	return FAILURE;
818
819	if (code_point != NULL)
820	code_point = (unsigned*)code_l;
821
822	return SUCCESS;
823	}
824	/ }}} /
825
826	/ {{{ process_named_entity /
827	static inline int process_named_entity_html(const char *buf, const* char *start, size_t length)
828	{
829	start = buf;
830
831	/ "&" is represented by a 0x26 in all supported encodings. That means*
832	* the byte after represents a character or is the leading byte of an
833	* sequence of 8-bit code units. If in the ranges below, it represents
834	* necessarily a alpha character because none of the supported encodings
835	* has an overlap with ASCII in the leading byte (only on the second one) */
836	while ((buf >= `'a'` && buf <= `'z'`) \|\|
837	(buf >= `'A'` && buf <= `'Z'`) \|\|
838	(buf >= `'0'` && buf <= `'9'`)) {
839	(*buf)++;
840	}
841
842	if (**buf != `';'`)
843	return FAILURE;
844
845	/ cast to size_t OK as the quantity is always non-negative /
846	length = buf - *start;
847
848	if (*length == `0`)
849	return FAILURE;
850
851	return SUCCESS;
852	}
853	/ }}} /
854
855	/ {{{ resolve_named_entity_html /
856	static inline int resolve_named_entity_html(const char start, size_t length, const* entity_ht ht, unsigned* uni_cp1, unsigned* *uni_cp2)
857	{
858	const entity_cp_map *s;
859	ulong hash = zend_inline_hash_func(start, length);
860
861	s = ht->buckets[hash % ht->num_elems];
862	while (s->entity) {
863	if (s->entity_len == length) {
864	if (memcmp(start, s->entity, length) == `0`) {
865	*uni_cp1 = s->codepoint1;
866	*uni_cp2 = s->codepoint2;
867	return SUCCESS;
868	}
869	}
870	s++;
871	}
872	return FAILURE;
873	}
874	/ }}} /
875
876	static inline size_t write_octet_sequence(unsigned char buf, enum* entity_charset charset, unsigned code) {
877	/ code is not necessarily a unicode code point /
878	switch (charset) {
879	case cs_utf_8:
880	return php_utf32_utf8(buf, code);
881
882	case cs_8859_1:
883	case cs_cp1252:
884	case cs_8859_15:
885	case cs_koi8r:
886	case cs_cp1251:
887	case cs_8859_5:
888	case cs_cp866:
889	case cs_macroman:
890	/ single byte stuff /
891	*buf = code;
892	return `1`;
893
894	case cs_big5:
895	case cs_big5hkscs:
896	case cs_sjis:
897	case cs_gb2312:
898	/ we don't have complete unicode mappings for these yet in entity_decode,*
899	* and we opt to pass through the octet sequences for these in htmlentities
900	* instead of converting to an int and then converting back. */
901	#if 0
902	return php_mb2_int_to_char(buf, code);
903	#else
904	#if ZEND_DEBUG
905	assert(code <= `0xFFU`);
906	#endif
907	*buf = code;
908	return `1`;
909	#endif
910
911	case cs_eucjp:
912	#if 0 /* idem */
913	return php_mb2_int_to_char(buf, code);
914	#else
915	#if ZEND_DEBUG
916	assert(code <= `0xFFU`);
917	#endif
918	*buf = code;
919	return `1`;
920	#endif
921
922	default:
923	assert(`0`);
924	return `0`;
925	}
926	}
927
928	/ {{{ traverse_for_entities*
929	* Auxiliary function to php_unescape_html_entities().
930	* - The argument "all" determines if all numeric entities are decode or only those
931	* that correspond to quotes (depending on quote_style).
932	*/
933	/ maximum expansion (factor 1.2) for HTML 5 with &nGt; and &nLt; /
934	/ +2 is 1 because of rest (probably unnecessary), 1 because of terminating 0 /
935	#define TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(oldlen) ((oldlen) + (oldlen) / 5 + 2)
936	static void traverse_for_entities(
937	const char *old,
938	size_t oldlen,
939	char ret, /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(olden) /
940	size_t *retlen,
941	int all,
942	int flags,
943	const entity_ht *inv_map,
944	enum entity_charset charset)
945	{
946	const char *p,
947	*lim;
948	char *q;
949	int doctype = flags & ENT_HTML_DOC_TYPE_MASK;
950
951	lim = old + oldlen; / terminator address /
952	assert(*lim == `'\0'`);
953
954	for (p = old, q = ret; p < lim;) {
955	unsigned code, code2 = `0`;
956	const char next = NULL; /* when set, next > p, otherwise possible inf loop /
957
958	/ Shift JIS, Big5 and HKSCS use multi-byte encodings where an*
959	* ASCII range byte can be part of a multi-byte sequence.
960	* However, they start at 0x40, therefore if we find a 0x26 byte,
961	* we're sure it represents the '&' character. */
962
963	/ assumes there are no single-char entities /
964	if (p[`0`] != `'&'` \|\| (p + `3` >= lim)) {
965	(q++) = (p++);
966	continue;
967	}
968
969	/ now p[3] is surely valid and is no terminator /
970
971	/ numerical entity /
972	if (p[`1`] == `'#'`) {
973	next = &p[`2`];
974	if (process_numeric_entity(&next, &code) == FAILURE)
975	goto invalid_code;
976
977	/ If we're in htmlspecialchars_decode, we're only decoding entities*
978	* that represent &, <, >, " and '. Is this one of them? */
979	if (!all && (code > `63U` \|\|
980	stage3_table_be_apos_00000[code].data.ent.entity == NULL))
981	goto invalid_code;
982
983	/ are we allowed to decode this entity in this document type?*
984	* HTML 5 is the only that has a character that cannot be used in
985	* a numeric entity but is allowed literally (U+000D). The
986	* unoptimized version would be ... \|\| !numeric_entity_is_allowed(code) */
987	if (!unicode_cp_is_allowed(code, doctype) \|\|
988	(doctype == ENT_HTML_DOC_HTML5 && code == `0x0D`))
989	goto invalid_code;
990	} else {
991	const char *start;
992	size_t ent_len;
993
994	next = &p[`1`];
995	start = next;
996
997	if (process_named_entity_html(&next, &start, &ent_len) == FAILURE)
998	goto invalid_code;
999
1000	if (resolve_named_entity_html(start, ent_len, inv_map, &code, &code2) == FAILURE) {
1001	if (doctype == ENT_HTML_DOC_XHTML && ent_len == `4` && start[`0`] == `'a'`
1002	&& start[`1`] == `'p'` && start[`2`] == `'o'` && start[`3`] == `'s'`) {
1003	/ uses html4 inv_map, which doesn't include apos;. This is a*
1004	* hack to support it */
1005	code = (unsigned) `'\''`;
1006	} else {
1007	goto invalid_code;
1008	}
1009	}
1010	}
1011
1012	assert(*next == `';'`);
1013
1014	if (((code == `'\''` && !(flags & ENT_HTML_QUOTE_SINGLE)) \|\|
1015	(code == `'"'` && !(flags & ENT_HTML_QUOTE_DOUBLE)))
1016	/ && code2 == '\0' always true for current maps /)
1017	goto invalid_code;
1018
1019	/ UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but*
1020	* the call is needed to ensure the codepoint <= U+00FF) */
1021	if (charset != cs_utf_8) {
1022	/ replace unicode code point /
1023	if (map_from_unicode(code, charset, &code) == FAILURE \|\| code2 != `0`)
1024	goto invalid_code; / not representable in target charset /
1025	}
1026
1027	q += write_octet_sequence(q, charset, code);
1028	if (code2) {
1029	q += write_octet_sequence(q, charset, code2);
1030	}
1031
1032	/ jump over the valid entity; may go beyond size of buffer; np /
1033	p = next + `1`;
1034	continue;
1035
1036	invalid_code:
1037	for (; p < next; p++) {
1038	(q++) = p;
1039	}
1040	}
1041
1042	*q = `'\0'`;
1043	*retlen = (size_t)(q - ret);
1044	}
1045	/ }}} /
1046
1047	/ {{{ unescape_inverse_map /
1048	static const entity_ht unescape_inverse_map(int* all, int flags)
1049	{
1050	int document_type = flags & ENT_HTML_DOC_TYPE_MASK;
1051
1052	if (all) {
1053	switch (document_type) {
1054	case ENT_HTML_DOC_HTML401:
1055	case ENT_HTML_DOC_XHTML: / but watch out for '.../
1056	return &ent_ht_html4;
1057	case ENT_HTML_DOC_HTML5:
1058	return &ent_ht_html5;
1059	default:
1060	return &ent_ht_be_apos;
1061	}
1062	} else {
1063	switch (document_type) {
1064	case ENT_HTML_DOC_HTML401:
1065	return &ent_ht_be_noapos;
1066	default:
1067	return &ent_ht_be_apos;
1068	}
1069	}
1070	}
1071	/ }}} /
1072
1073	/ {{{ determine_entity_table*
1074	* Entity table to use. Note that entity tables are defined in terms of
1075	* unicode code points */
1076	static entity_table_opt determine_entity_table(int all, int doctype)
1077	{
1078	entity_table_opt retval = {NULL};
1079
1080	assert(!(doctype == ENT_HTML_DOC_XML1 && all));
1081
1082	if (all) {
1083	retval.ms_table = (doctype == ENT_HTML_DOC_HTML5) ?
1084	entity_ms_table_html5 : entity_ms_table_html4;
1085	} else {
1086	retval.table = (doctype == ENT_HTML_DOC_HTML401) ?
1087	stage3_table_be_noapos_00000 : stage3_table_be_apos_00000;
1088	}
1089	return retval;
1090	}
1091	/ }}} /
1092
1093	/ {{{ php_unescape_html_entities*
1094	* The parameter "all" should be true to decode all possible entities, false to decode
1095	* only the basic ones, i.e., those in basic_entities_ex + the numeric entities
1096	* that correspond to quotes.
1097	*/
1098	PHPAPI char php_unescape_html_entities(unsigned* char old, size_t oldlen, size_t newlen, int all, int flags, char *hint_charset TSRMLS_DC)
1099	{
1100	size_t retlen;
1101	char *ret;
1102	enum entity_charset charset;
1103	const entity_ht *inverse_map = NULL;
1104	size_t new_size = TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(oldlen);
1105
1106	if (all) {
1107	charset = determine_charset(hint_charset TSRMLS_CC);
1108	} else {
1109	charset = cs_8859_1; / charset shouldn't matter, use ISO-8859-1 for performance /
1110	}
1111
1112	/ don't use LIMIT_ALL! /
1113
1114	if (oldlen > new_size) {
1115	/ overflow, refuse to do anything /
1116	ret = estrndup((char*)old, oldlen);
1117	retlen = oldlen;
1118	goto empty_source;
1119	}
1120	ret = emalloc(new_size);
1121	*ret = `'\0'`;
1122	retlen = oldlen;
1123	if (retlen == `0`) {
1124	goto empty_source;
1125	}
1126
1127	inverse_map = unescape_inverse_map(all, flags);
1128
1129	/ replace numeric entities /
1130	traverse_for_entities(old, oldlen, ret, &retlen, all, flags, inverse_map, charset);
1131
1132	empty_source:
1133	*newlen = retlen;
1134	return ret;
1135	}
1136	/ }}} /
1137
1138	PHPAPI char php_escape_html_entities(unsigned* char old, size_t oldlen, size_t newlen, int all, int flags, char *hint_charset TSRMLS_DC)
1139	{
1140	return php_escape_html_entities_ex(old, oldlen, newlen, all, flags, hint_charset, `1` TSRMLS_CC);
1141	}
1142
1143	/ {{{ find_entity_for_char /
1144	static inline void find_entity_for_char(
1145	unsigned int k,
1146	enum entity_charset charset,
1147	const entity_stage1_row *table,
1148	const unsigned char **entity,
1149	size_t *entity_len,
1150	unsigned char *old,
1151	size_t oldlen,
1152	size_t *cursor)
1153	{
1154	unsigned stage1_idx = ENT_STAGE1_INDEX(k);
1155	const entity_stage3_row *c;
1156
1157	if (stage1_idx > `0x1D`) {
1158	*entity = NULL;
1159	*entity_len = `0`;
1160	return;
1161	}
1162
1163	c = &table[stage1_idx][ENT_STAGE2_INDEX(k)][ENT_STAGE3_INDEX(k)];
1164
1165	if (!c->ambiguous) {
1166	entity = (const* unsigned char *)c->data.ent.entity;
1167	*entity_len = c->data.ent.entity_len;
1168	} else {
1169	/ peek at next char /
1170	size_t cursor_before = *cursor;
1171	int status = SUCCESS;
1172	unsigned next_char;
1173
1174	if (!(*cursor < oldlen))
1175	goto no_suitable_2nd;
1176
1177	next_char = get_next_char(charset, old, oldlen, cursor, &status);
1178
1179	if (status == FAILURE)
1180	goto no_suitable_2nd;
1181
1182	{
1183	const entity_multicodepoint_row s, e;
1184
1185	s = &c->data.multicodepoint_table[`1`];
1186	e = s - `1` + c->data.multicodepoint_table[`0`].leading_entry.size;
1187	/ we could do a binary search but it's not worth it since we have*
1188	* at most two entries... */
1189	for ( ; s <= e; s++) {
1190	if (s->normal_entry.second_cp == next_char) {
1191	*entity = s->normal_entry.entity;
1192	*entity_len = s->normal_entry.entity_len;
1193	return;
1194	}
1195	}
1196	}
1197	no_suitable_2nd:
1198	*cursor = cursor_before;
1199	entity = (const* unsigned char *)
1200	c->data.multicodepoint_table[`0`].leading_entry.default_entity;
1201	*entity_len = c->data.multicodepoint_table[`0`].leading_entry.default_entity_len;
1202	}
1203	}
1204	/ }}} /
1205
1206	/ {{{ find_entity_for_char_basic /
1207	static inline void find_entity_for_char_basic(
1208	unsigned int k,
1209	const entity_stage3_row *table,
1210	const unsigned char **entity,
1211	size_t *entity_len)
1212	{
1213	if (k >= `64U`) {
1214	*entity = NULL;
1215	*entity_len = `0`;
1216	return;
1217	}
1218
1219	*entity = table[k].data.ent.entity;
1220	*entity_len = table[k].data.ent.entity_len;
1221	}
1222	/ }}} /
1223
1224	/ {{{ php_escape_html_entities*
1225	*/
1226	PHPAPI char php_escape_html_entities_ex(unsigned* char old, size_t oldlen, size_t newlen, int all, int flags, char *hint_charset, zend_bool double_encode TSRMLS_DC)
1227	{
1228	size_t cursor, maxlen, len;
1229	char *replaced;
1230	enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
1231	int doctype = flags & ENT_HTML_DOC_TYPE_MASK;
1232	entity_table_opt entity_table;
1233	const enc_to_uni *to_uni_table = NULL;
1234	const entity_ht inv_map = NULL; /* used for !double_encode /
1235	/ only used if flags includes ENT_HTML_IGNORE_ERRORS or ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS /
1236	const unsigned char *replacement = NULL;
1237	size_t replacement_len = `0`;
1238
1239	if (all) { / replace with all named entities /
1240	if (CHARSET_PARTIAL_SUPPORT(charset)) {
1241	php_error_docref0(NULL TSRMLS_CC, E_STRICT, "Only basic entities "
1242	"substitution is supported for multi-byte encodings other than UTF-8; "
1243	"functionality is equivalent to htmlspecialchars");
1244	}
1245	LIMIT_ALL(all, doctype, charset);
1246	}
1247	entity_table = determine_entity_table(all, doctype);
1248	if (all && !CHARSET_UNICODE_COMPAT(charset)) {
1249	to_uni_table = enc_to_uni_index[charset];
1250	}
1251
1252	if (!double_encode) {
1253	/ first arg is 1 because we want to identify valid named entities*
1254	* even if we are only encoding the basic ones */
1255	inv_map = unescape_inverse_map(`1`, flags);
1256	}
1257
1258	if (flags & (ENT_HTML_SUBSTITUTE_ERRORS \| ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS)) {
1259	if (charset == cs_utf_8) {
1260	replacement = (const unsigned char*)"\xEF\xBF\xBD";
1261	replacement_len = sizeof("\xEF\xBF\xBD") - `1`;
1262	} else {
1263	replacement = (const unsigned char*)"�";
1264	replacement_len = sizeof("�") - `1`;
1265	}
1266	}
1267
1268	/ initial estimate /
1269	if (oldlen < `64`) {
1270	maxlen = `128`;
1271	} else {
1272	maxlen = `2` * oldlen;
1273	if (maxlen < oldlen) {
1274	zend_error_noreturn(E_ERROR, "Input string is too long");
1275	return NULL;
1276	}
1277	}
1278
1279	replaced = emalloc(maxlen + `1`); / adding 1 is safe: maxlen is even /
1280	len = `0`;
1281	cursor = `0`;
1282	while (cursor < oldlen) {
1283	const unsigned char *mbsequence = NULL;
1284	size_t mbseqlen = `0`,
1285	cursor_before = cursor;
1286	int status = SUCCESS;
1287	unsigned int this_char = get_next_char(charset, old, oldlen, &cursor, &status);
1288
1289	/ guarantee we have at least 40 bytes to write.*
1290	* In HTML5, entities may take up to 33 bytes */
1291	if (len > maxlen - `40`) { / maxlen can never be smaller than 128 /
1292	replaced = safe_erealloc(replaced, maxlen , `1`, `128` + `1`);
1293	maxlen += `128`;
1294	}
1295
1296	if (status == FAILURE) {
1297	/ invalid MB sequence /
1298	if (flags & ENT_HTML_IGNORE_ERRORS) {
1299	continue;
1300	} else if (flags & ENT_HTML_SUBSTITUTE_ERRORS) {
1301	memcpy(&replaced[len], replacement, replacement_len);
1302	len += replacement_len;
1303	continue;
1304	} else {
1305	efree(replaced);
1306	*newlen = `0`;
1307	return STR_EMPTY_ALLOC();
1308	}
1309	} else { / SUCCESS /
1310	mbsequence = &old[cursor_before];
1311	mbseqlen = cursor - cursor_before;
1312	}
1313
1314	if (this_char != `'&'`) { / no entity on this position /
1315	const unsigned char *rep = NULL;
1316	size_t rep_len = `0`;
1317
1318	if (((this_char == `'\''` && !(flags & ENT_HTML_QUOTE_SINGLE)) \|\|
1319	(this_char == `'"'` && !(flags & ENT_HTML_QUOTE_DOUBLE))))
1320	goto pass_char_through;
1321
1322	if (all) { / false that CHARSET_PARTIAL_SUPPORT(charset) /
1323	if (to_uni_table != NULL) {
1324	/ !CHARSET_UNICODE_COMPAT therefore not UTF-8; since UTF-8*
1325	* is the only multibyte encoding with !CHARSET_PARTIAL_SUPPORT,
1326	* we're using a single byte encoding */
1327	map_to_unicode(this_char, to_uni_table, &this_char);
1328	if (this_char == `0xFFFF`) / no mapping; pass through /
1329	goto pass_char_through;
1330	}
1331	/ the cursor may advance /
1332	find_entity_for_char(this_char, charset, entity_table.ms_table, &rep,
1333	&rep_len, old, oldlen, &cursor);
1334	} else {
1335	find_entity_for_char_basic(this_char, entity_table.table, &rep, &rep_len);
1336	}
1337
1338	if (rep != NULL) {
1339	replaced[len++] = `'&'`;
1340	memcpy(&replaced[len], rep, rep_len);
1341	len += rep_len;
1342	replaced[len++] = `';'`;
1343	} else {
1344	/ we did not find an entity for this char.*
1345	* check for its validity, if its valid pass it unchanged */
1346	if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) {
1347	if (CHARSET_UNICODE_COMPAT(charset)) {
1348	if (!unicode_cp_is_allowed(this_char, doctype)) {
1349	mbsequence = replacement;
1350	mbseqlen = replacement_len;
1351	}
1352	} else if (to_uni_table) {
1353	if (!all) / otherwise we already did this /
1354	map_to_unicode(this_char, to_uni_table, &this_char);
1355	if (!unicode_cp_is_allowed(this_char, doctype)) {
1356	mbsequence = replacement;
1357	mbseqlen = replacement_len;
1358	}
1359	} else {
1360	/ not a unicode code point, unless, coincidentally, it's in*
1361	* the 0x20..0x7D range (except 0x5C in sjis). We know nothing
1362	* about other code points, because we have no tables. Since
1363	* Unicode code points in that range are not disallowed in any
1364	* document type, we could do nothing. However, conversion
1365	* tables frequently map 0x00-0x1F to the respective C0 code
1366	* points. Let's play it safe and admit that's the case */
1367	if (this_char <= `0x7D` &&
1368	!unicode_cp_is_allowed(this_char, doctype)) {
1369	mbsequence = replacement;
1370	mbseqlen = replacement_len;
1371	}
1372	}
1373	}
1374	pass_char_through:
1375	if (mbseqlen > `1`) {
1376	memcpy(replaced + len, mbsequence, mbseqlen);
1377	len += mbseqlen;
1378	} else {
1379	replaced[len++] = mbsequence[`0`];
1380	}
1381	}
1382	} else { / this_char == '&' /
1383	if (double_encode) {
1384	encode_amp:
1385	memcpy(&replaced[len], "&", sizeof("&") - `1`);
1386	len += sizeof("&") - `1`;
1387	} else { / no double encode /
1388	/ check if entity is valid /
1389	size_t ent_len; / not counting & or ; /
1390	/ peek at next char /
1391	if (old[cursor] == `'#'`) { / numeric entity /
1392	unsigned code_point;
1393	int valid;
1394	char pos = (char**)&old[cursor+`1`];
1395	valid = process_numeric_entity((const char **)&pos, &code_point);
1396	if (valid == FAILURE)
1397	goto encode_amp;
1398	if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) {
1399	if (!numeric_entity_is_allowed(code_point, doctype))
1400	goto encode_amp;
1401	}
1402	ent_len = pos - (char*)&old[cursor];
1403	} else { / named entity /
1404	/ check for vality of named entity /
1405	const char *start = &old[cursor],
1406	*next = start;
1407	unsigned dummy1, dummy2;
1408
1409	if (process_named_entity_html(&next, &start, &ent_len) == FAILURE)
1410	goto encode_amp;
1411	if (resolve_named_entity_html(start, ent_len, inv_map, &dummy1, &dummy2) == FAILURE) {
1412	if (!(doctype == ENT_HTML_DOC_XHTML && ent_len == `4` && start[`0`] == `'a'`
1413	&& start[`1`] == `'p'` && start[`2`] == `'o'` && start[`3`] == `'s'`)) {
1414	/ uses html4 inv_map, which doesn't include apos;. This is a*
1415	* hack to support it */
1416	goto encode_amp;
1417	}
1418	}
1419	}
1420	/ checks passed; copy entity to result /
1421	/ entity size is unbounded, we may need more memory /
1422	/ at this point maxlen - len >= 40 /
1423	if (maxlen - len < ent_len + `2` / & and ; /) {
1424	/ ent_len < oldlen, which is certainly <= SIZE_MAX/2 /
1425	replaced = safe_erealloc(replaced, maxlen, `1`, ent_len + `128` + `1`);
1426	maxlen += ent_len + `128`;
1427	}
1428	replaced[len++] = `'&'`;
1429	memcpy(&replaced[len], &old[cursor], ent_len);
1430	len += ent_len;
1431	replaced[len++] = `';'`;
1432	cursor += ent_len + `1`;
1433	}
1434	}
1435	}
1436	replaced[len] = `'\0'`;
1437	*newlen = len;
1438
1439	return replaced;
1440	}
1441	/ }}} /
1442
1443	/ {{{ php_html_entities*
1444	*/
1445	static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
1446	{
1447	char str, hint_charset = NULL;
1448	int str_len, hint_charset_len = `0`;
1449	size_t new_len;
1450	long flags = ENT_COMPAT;
1451	char *replaced;
1452	zend_bool double_encode = `1`;
1453
1454	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s\|ls!b", &str, &str_len, &flags, &hint_charset, &hint_charset_len, &double_encode) == FAILURE) {
1455	return;
1456	}
1457
1458	if (!hint_charset) {
1459	hint_charset = get_default_charset(TSRMLS_C);
1460	}
1461	replaced = php_escape_html_entities_ex(str, str_len, &new_len, all, (int) flags, hint_charset, double_encode TSRMLS_CC);
1462
1463	RETVAL_STRINGL(replaced, (int)new_len, `0`);
1464	}
1465	/ }}} /
1466
1467	#define HTML_SPECIALCHARS 0
1468	#define HTML_ENTITIES 1
1469
1470	/ {{{ register_html_constants*
1471	*/
1472	void register_html_constants(INIT_FUNC_ARGS)
1473	{
1474	REGISTER_LONG_CONSTANT("HTML_SPECIALCHARS", HTML_SPECIALCHARS, CONST_PERSISTENT\|CONST_CS);
1475	REGISTER_LONG_CONSTANT("HTML_ENTITIES", HTML_ENTITIES, CONST_PERSISTENT\|CONST_CS);
1476	REGISTER_LONG_CONSTANT("ENT_COMPAT", ENT_COMPAT, CONST_PERSISTENT\|CONST_CS);
1477	REGISTER_LONG_CONSTANT("ENT_QUOTES", ENT_QUOTES, CONST_PERSISTENT\|CONST_CS);
1478	REGISTER_LONG_CONSTANT("ENT_NOQUOTES", ENT_NOQUOTES, CONST_PERSISTENT\|CONST_CS);
1479	REGISTER_LONG_CONSTANT("ENT_IGNORE", ENT_IGNORE, CONST_PERSISTENT\|CONST_CS);
1480	REGISTER_LONG_CONSTANT("ENT_SUBSTITUTE", ENT_SUBSTITUTE, CONST_PERSISTENT\|CONST_CS);
1481	REGISTER_LONG_CONSTANT("ENT_DISALLOWED", ENT_DISALLOWED, CONST_PERSISTENT\|CONST_CS);
1482	REGISTER_LONG_CONSTANT("ENT_HTML401", ENT_HTML401, CONST_PERSISTENT\|CONST_CS);
1483	REGISTER_LONG_CONSTANT("ENT_XML1", ENT_XML1, CONST_PERSISTENT\|CONST_CS);
1484	REGISTER_LONG_CONSTANT("ENT_XHTML", ENT_XHTML, CONST_PERSISTENT\|CONST_CS);
1485	REGISTER_LONG_CONSTANT("ENT_HTML5", ENT_HTML5, CONST_PERSISTENT\|CONST_CS);
1486	}
1487	/ }}} /
1488
1489	/ {{{ proto string htmlspecialchars(string string [, int quote_style[, string charset[, bool double_encode]]])*
1490	Convert special characters to HTML entities /*
1491	PHP_FUNCTION(htmlspecialchars)
1492	{
1493	php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, `0`);
1494	}
1495	/ }}} /
1496
1497	/ {{{ proto string htmlspecialchars_decode(string string [, int quote_style])*
1498	Convert special HTML entities back to characters /*
1499	PHP_FUNCTION(htmlspecialchars_decode)
1500	{
1501	char *str;
1502	int str_len;
1503	size_t new_len = `0`;
1504	long quote_style = ENT_COMPAT;
1505	char *replaced;
1506
1507	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s\|l", &str, &str_len, &quote_style) == FAILURE) {
1508	return;
1509	}
1510
1511	replaced = php_unescape_html_entities(str, str_len, &new_len, `0` /!all/, quote_style, NULL TSRMLS_CC);
1512	if (replaced) {
1513	RETURN_STRINGL(replaced, (int)new_len, `0`);
1514	}
1515	RETURN_FALSE;
1516	}
1517	/ }}} /
1518
1519	/ {{{ proto string html_entity_decode(string string [, int quote_style][, string charset])*
1520	Convert all HTML entities to their applicable characters /*
1521	PHP_FUNCTION(html_entity_decode)
1522	{
1523	char str, hint_charset = NULL;
1524	int str_len, hint_charset_len;
1525	size_t new_len = `0`;
1526	long quote_style = ENT_COMPAT;
1527	char *replaced;
1528
1529	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s\|ls", &str, &str_len,
1530	&quote_style, &hint_charset, &hint_charset_len) == FAILURE) {
1531	return;
1532	}
1533
1534	if (!hint_charset) {
1535	hint_charset = get_default_charset(TSRMLS_C);
1536	}
1537	replaced = php_unescape_html_entities(str, str_len, &new_len, `1` /all/, quote_style, hint_charset TSRMLS_CC);
1538
1539	if (replaced) {
1540	RETURN_STRINGL(replaced, (int)new_len, `0`);
1541	}
1542	RETURN_FALSE;
1543	}
1544	/ }}} /
1545
1546
1547	/ {{{ proto string htmlentities(string string [, int quote_style[, string charset[, bool double_encode]]])*
1548	Convert all applicable characters to HTML entities /*
1549	PHP_FUNCTION(htmlentities)
1550	{
1551	php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, `1`);
1552	}
1553	/ }}} /
1554
1555	/ {{{ write_s3row_data /
1556	static inline void write_s3row_data(
1557	const entity_stage3_row *r,
1558	unsigned orig_cp,
1559	enum entity_charset charset,
1560	zval *arr)
1561	{
1562	char key[`9`] = ""; / two unicode code points in UTF-8 /
1563	char entity[LONGEST_ENTITY_LENGTH + `2`] = {`'&'`};
1564	size_t written_k1;
1565
1566	written_k1 = write_octet_sequence(key, charset, orig_cp);
1567
1568	if (!r->ambiguous) {
1569	size_t l = r->data.ent.entity_len;
1570	memcpy(&entity[`1`], r->data.ent.entity, l);
1571	entity[l + `1`] = `';'`;
1572	add_assoc_stringl_ex(arr, key, written_k1 + `1`, entity, l + `2`, `1`);
1573	} else {
1574	unsigned i,
1575	num_entries;
1576	const entity_multicodepoint_row *mcpr = r->data.multicodepoint_table;
1577
1578	if (mcpr[`0`].leading_entry.default_entity != NULL) {
1579	size_t l = mcpr[`0`].leading_entry.default_entity_len;
1580	memcpy(&entity[`1`], mcpr[`0`].leading_entry.default_entity, l);
1581	entity[l + `1`] = `';'`;
1582	add_assoc_stringl_ex(arr, key, written_k1 + `1`, entity, l + `2`, `1`);
1583	}
1584	num_entries = mcpr[`0`].leading_entry.size;
1585	for (i = `1`; i <= num_entries; i++) {
1586	size_t l,
1587	written_k2;
1588	unsigned uni_cp,
1589	spe_cp;
1590
1591	uni_cp = mcpr[i].normal_entry.second_cp;
1592	l = mcpr[i].normal_entry.entity_len;
1593
1594	if (!CHARSET_UNICODE_COMPAT(charset)) {
1595	if (map_from_unicode(uni_cp, charset, &spe_cp) == FAILURE)
1596	continue; / non representable in this charset /
1597	} else {
1598	spe_cp = uni_cp;
1599	}
1600
1601	written_k2 = write_octet_sequence(&key[written_k1], charset, spe_cp);
1602	memcpy(&entity[`1`], mcpr[i].normal_entry.entity, l);
1603	entity[l + `1`] = `';'`;
1604	entity[l + `1`] = `'\0'`;
1605	add_assoc_stringl_ex(arr, key, written_k1 + written_k2 + `1`, entity, l + `1`, `1`);
1606	}
1607	}
1608	}
1609	/ }}} /
1610
1611	/ {{{ proto array get_html_translation_table([int table [, int flags [, string charset_hint]]])*
1612	Returns the internal translation table used by htmlspecialchars and htmlentities /*
1613	PHP_FUNCTION(get_html_translation_table)
1614	{
1615	long all = HTML_SPECIALCHARS,
1616	flags = ENT_COMPAT;
1617	int doctype;
1618	entity_table_opt entity_table;
1619	const enc_to_uni *to_uni_table = NULL;
1620	char *charset_hint = NULL;
1621	int charset_hint_len;
1622	enum entity_charset charset;
1623
1624	/ in this function we have to jump through some loops because we're*
1625	* getting the translated table from data structures that are optimized for
1626	* random access, not traversal */
1627
1628	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "\|lls",
1629	&all, &flags, &charset_hint, &charset_hint_len) == FAILURE) {
1630	return;
1631	}
1632
1633	charset = determine_charset(charset_hint TSRMLS_CC);
1634	doctype = flags & ENT_HTML_DOC_TYPE_MASK;
1635	LIMIT_ALL(all, doctype, charset);
1636
1637	array_init(return_value);
1638
1639	entity_table = determine_entity_table(all, doctype);
1640	if (all && !CHARSET_UNICODE_COMPAT(charset)) {
1641	to_uni_table = enc_to_uni_index[charset];
1642	}
1643
1644	if (all) { / HTML_ENTITIES (actually, any non-zero value for 1st param) /
1645	const entity_stage1_row *ms_table = entity_table.ms_table;
1646
1647	if (CHARSET_UNICODE_COMPAT(charset)) {
1648	unsigned i, j, k,
1649	max_i, max_j, max_k;
1650	/ no mapping to unicode required /
1651	if (CHARSET_SINGLE_BYTE(charset)) { / ISO-8859-1 /
1652	max_i = `1`; max_j = `4`; max_k = `64`;
1653	} else {
1654	max_i = `0x1E`; max_j = `64`; max_k = `64`;
1655	}
1656
1657	for (i = `0`; i < max_i; i++) {
1658	if (ms_table[i] == empty_stage2_table)
1659	continue;
1660	for (j = `0`; j < max_j; j++) {
1661	if (ms_table[i][j] == empty_stage3_table)
1662	continue;
1663	for (k = `0`; k < max_k; k++) {
1664	const entity_stage3_row *r = &ms_table[i][j][k];
1665	unsigned code;
1666
1667	if (r->data.ent.entity == NULL)
1668	continue;
1669
1670	code = ENT_CODE_POINT_FROM_STAGES(i, j, k);
1671	if (((code == `'\''` && !(flags & ENT_HTML_QUOTE_SINGLE)) \|\|
1672	(code == `'"'` && !(flags & ENT_HTML_QUOTE_DOUBLE))))
1673	continue;
1674	write_s3row_data(r, code, charset, return_value);
1675	}
1676	}
1677	}
1678	} else {
1679	/ we have to iterate through the set of code points for this*
1680	* encoding and map them to unicode code points */
1681	unsigned i;
1682	for (i = `0`; i <= `0xFF`; i++) {
1683	const entity_stage3_row *r;
1684	unsigned uni_cp;
1685
1686	/ can be done before mapping, they're invariant /
1687	if (((i == `'\''` && !(flags & ENT_HTML_QUOTE_SINGLE)) \|\|
1688	(i == `'"'` && !(flags & ENT_HTML_QUOTE_DOUBLE))))
1689	continue;
1690
1691	map_to_unicode(i, to_uni_table, &uni_cp);
1692	r = &ms_table[ENT_STAGE1_INDEX(uni_cp)][ENT_STAGE2_INDEX(uni_cp)][ENT_STAGE3_INDEX(uni_cp)];
1693	if (r->data.ent.entity == NULL)
1694	continue;
1695
1696	write_s3row_data(r, i, charset, return_value);
1697	}
1698	}
1699	} else {
1700	/ we could use sizeof(stage3_table_be_apos_00000) as well /
1701	unsigned j,
1702	numelems = sizeof(stage3_table_be_noapos_00000) /
1703	sizeof(*stage3_table_be_noapos_00000);
1704
1705	for (j = `0`; j < numelems; j++) {
1706	const entity_stage3_row *r = &entity_table.table[j];
1707	if (r->data.ent.entity == NULL)
1708	continue;
1709
1710	if (((j == `'\''` && !(flags & ENT_HTML_QUOTE_SINGLE)) \|\|
1711	(j == `'"'` && !(flags & ENT_HTML_QUOTE_DOUBLE))))
1712	continue;
1713
1714	/ charset is indifferent, used cs_8859_1 for efficiency /
1715	write_s3row_data(r, j, cs_8859_1, return_value);
1716	}
1717	}
1718	}
1719	/ }}} /
1720
1721	/*
1722	* Local variables:
1723	* tab-width: 4
1724	* c-basic-offset: 4
1725	* End:
1726	* vim600: sw=4 ts=4 fdm=marker
1727	* vim<600: sw=4 ts=4
1728	*/
1729

Browse the source code of php/ext/standard/html.c