1
+ #include "charset.h"
2
+
3
+ struct vstr * get_encoding (FILE * stream , num_bytes_meta ) {
4
+
5
+ //Read the first 512 bytes of the stream into an array
6
+ char buf [num_bytes_meta + 1 ];
7
+ int n ;
8
+ struct vstr * encoding ;
9
+
10
+ encoding = vstr_new (8 );
11
+
12
+ n = fread (buf , 1 , BUF_SIZE , stream );
13
+ //Reset the buffer position
14
+ fseek (stream , - n , SEEK_CUR );
15
+ buf [n ] = '\0' ;
16
+
17
+ //Check for byte order marks
18
+ if (strncmp (buf , "\xfe\xff" , 2 ) == 0 ) {
19
+ vstr_append (encoding , "UTF-16BE" );
20
+ } else if (strncmp (buf , "\xff\xfe" , 2 ) == 0 ) {
21
+ vstr_append (encoding , "UTF-16LE" );
22
+ } else if (strncmp (buf , "\xef\xbb\xbf" , 3 ) == 0 ) {
23
+ vstr_append (encoding , "UTF-8" );
24
+ } else {
25
+ //Run autodetect algorithm
26
+ detect_encoding (& buf [0 ], encoding );
27
+ }
28
+
29
+ return encoding ;
30
+ };
31
+
32
+ void detect_encoding (char * buf , struct vstr * encoding ) {
33
+ while (* buf != '\0' ) {
34
+ if (strncmp (buf , "<!--" , 4 ) == 0 ) {
35
+ buf = jump_to (buf , "-->" );
36
+ } else if (strncmp (buf , "<meta" , 4 ) == 0 ) {
37
+ buf = handle_meta (buf , encoding );
38
+ if (* (encoding -> str ) != '\0' ) {
39
+ break ;
40
+ }
41
+ } else if (* buf == '<' && isalpha ((int )* (buf + 1 ))) {
42
+ buf = handle_tag (buf );
43
+ } else if (* buf == '<' && * (buf + 1 ) == '/' && isalpha ((int )* (buf + 2 ))) {
44
+ buf = handle_tag (buf );
45
+ } else if (strncmp (buf , "<!" , 2 ) == 0 || strncmp (buf , "</" , 2 ) == 0 || strncmp (buf , "<?" , 2 ) == 0 ) {
46
+ buf = jump_to (buf , ">" );
47
+ if (buf == NULL ) {
48
+ break ;
49
+ }
50
+ }
51
+ buf ++ ;
52
+ }
53
+ };
54
+
55
+ struct attr * attr_new () {
56
+ struct attr * new_attr ;
57
+ new_attr = (struct attr * )malloc (sizeof (struct attr ));
58
+ return new_attr ;
59
+ };
60
+
61
+ char * handle_meta (char * buf , struct vstr * encoding ) {
62
+ struct attr * attr_value ;
63
+ buf += 5 ; //Point to the character after the a
64
+ if (isspace ((int )* buf ) == 0 ) {
65
+ //The next character is not a space so treat it as an ordinary tag
66
+ buf -= 5 ;
67
+ buf = handle_tag (buf );
68
+ } else {
69
+ attr_value = attr_new ();
70
+ buf = get_attr (buf , attr_value );
71
+ while (* (attr_value -> name -> str ) != '\0' ) {
72
+ if (vstr_cmp (attr_value -> name , "charset" ) == 0 ) {
73
+ if (is_encoding (attr_value -> value )) {
74
+ vstr_append (encoding , attr_value -> value -> str );
75
+ break ;
76
+ }
77
+ } else if (vstr_cmp (attr_value -> name , "content" )) {
78
+ //Parse the content value
79
+ struct vstr * content_encoding = handle_content_type (attr_value -> value );
80
+ if (* (content_encoding -> str ) != '\0' && is_encoding (content_encoding )) {
81
+ vstr_append (encoding , content_encoding -> str );
82
+ break ;
83
+ }
84
+ vstr_free (content_encoding );
85
+ }
86
+ buf = get_attr (buf , attr_value );
87
+ }
88
+ free (attr_value );
89
+ }
90
+ return buf ;
91
+ };
92
+
93
// Find the first occurrence of `target` in the NUL-terminated string `str`.
//
// Returns a pointer to the LAST byte of that occurrence (callers advance by
// one afterwards), or NULL when `target` does not occur. An empty `target`
// also yields NULL.
char *jump_to(char *str, char *target) {
    size_t target_len = strlen(target);
    char *match;

    if (target_len == 0) {
        return NULL;
    }

    // strstr keeps searching after a partial match; the previous hand-written
    // loop never advanced in that case and spun forever (e.g. "--->" / "-->").
    match = strstr(str, target);
    if (match == NULL) {
        return NULL;
    }
    return match + target_len - 1;
}
110
+
111
+ char * handle_tag (char * buf ) {
112
+ int skip_chars ;
113
+ struct attr * attr_value ;
114
+ buf ++ ;
115
+
116
+ skip_chars = strcspn (buf , "\t\n\f\v\f\r /><" );
117
+ buf += skip_chars ;
118
+
119
+ if (* buf == '<' ) {
120
+ buf -= 1 ; // This will be added back on in the caller
121
+ return buf ;
122
+ };
123
+
124
+ attr_value = attr_new ();
125
+ buf = get_attr (buf , attr_value );
126
+ while (* (attr_value -> name -> str ) != '\0' && buf != '\0' ) {
127
+ buf = get_attr (buf , attr_value );
128
+ };
129
+ free (attr_value );
130
+
131
+ return buf ;
132
+ };
133
+
134
+ char * get_attr (char * buf , struct attr * attr_value ) {
135
+ int skip_chars ;
136
+ char quote [1 ];
137
+ char lcase_letter [1 ];
138
+
139
+ int spaces = 0 ; //Do the spaces step
140
+
141
+ attr_value -> name = vstr_new (8 );
142
+ attr_value -> value = vstr_new (8 );
143
+
144
+ * (attr_value -> name -> str ) = '\0' ;
145
+ * (attr_value -> value -> str ) = '\0' ;
146
+ skip_chars = strspn (buf , "\t\n\f\v\f\r /" );
147
+ buf += skip_chars ;
148
+ if (* buf == '\0' || * buf == '<' || * buf == '>' ) {
149
+ if (* buf == '<' ) {
150
+ buf -= 1 ;
151
+ }
152
+ return buf ;
153
+ }
154
+
155
+ while (1 ) {
156
+ if (* buf == '\0' ) {
157
+ return buf ;
158
+ } else if (* buf == '=' && strlen (attr_value -> name -> str ) != 0 ) {
159
+ buf ++ ;
160
+ break ;
161
+ } else if (isspace ((int )(* buf ))){
162
+ spaces = 1 ;
163
+ break ;
164
+ } else if (* buf == '/' || * buf == '<' || * buf == '>' ) {
165
+ return buf ;
166
+ } else if (isupper ((int )(* buf ))) {
167
+ lcase_letter [0 ] = (char )tolower ((int )(* buf ));
168
+ vstr_append_n (attr_value -> name , lcase_letter , 1 );
169
+ } else {
170
+ vstr_append_n (attr_value -> name , buf , 1 );
171
+ }
172
+ buf ++ ;
173
+ }
174
+
175
+ if (spaces ) {
176
+ buf = skip_space (buf );
177
+ if (* buf != '=' ) {
178
+ buf -= 1 ;
179
+ return buf ;
180
+ } else {
181
+ buf ++ ;
182
+ }
183
+ }
184
+
185
+ buf = skip_space (buf );
186
+ if (* buf == '\'' || * buf == '"' ) {
187
+ quote [0 ] = * buf ;
188
+ buf ++ ;
189
+ while (* buf != quote [0 ] && * buf != '\0' ) {
190
+ if (isupper ((int )(* buf ))) {
191
+ vstr_append_n (attr_value -> value , (char * )tolower ((int )(* buf )), 1 );
192
+ } else {
193
+ vstr_append_n (attr_value -> value , buf , 1 );
194
+ }
195
+ buf ++ ;
196
+ }
197
+ //XXX need to advance position here
198
+ if (* buf == quote [0 ]) {
199
+ buf ++ ;
200
+ }
201
+ return buf ;
202
+ } else if (* buf == '<' || * buf == '>' || * buf == '\0' ){
203
+ return buf ;
204
+ } else if (isupper ((int )(* buf ))) {
205
+ lcase_letter [0 ] = (char )tolower ((int )(* buf ));
206
+ vstr_append_n (attr_value -> value , lcase_letter , 1 );
207
+ } else {
208
+ vstr_append_n (attr_value -> value , buf , 1 );
209
+ };
210
+ buf ++ ;
211
+ while (buf != '\0' ) {
212
+ if (isspace ((int )(* buf )) || * buf == '<' || * buf == '>' ) {
213
+ return buf ;
214
+ } else if (isupper ((int )(* buf ))) {
215
+ lcase_letter [0 ] = (char )tolower ((int )(* buf ));
216
+ vstr_append_n (attr_value -> value , lcase_letter , 1 );
217
+ } else {
218
+ vstr_append_n (attr_value -> value , buf , 1 );
219
+ };
220
+ buf ++ ;
221
+ }
222
+ return buf ;
223
+ };
224
+
225
+ struct vstr * handle_content_type (struct vstr * attr_value ) {
226
+ struct vstr * encoding ;
227
+ char * value ;
228
+ char * quote ;
229
+
230
+ encoding = vstr_new (8 );
231
+ value = attr_value -> str ;
232
+ //Skip characters up to and including the first ;
233
+ value = jump_to (value , ";" );
234
+ value ++ ;
235
+
236
+ if (* value == '\0' ) {
237
+ return encoding ;
238
+ }
239
+
240
+ skip_space (value );
241
+
242
+ if (strncmp (value , "charset" , 7 ) != 0 ) {
243
+ return encoding ;
244
+ }
245
+ value += 7 ;
246
+
247
+ skip_space (value );
248
+
249
+ if (* value != '=' ) {
250
+ return encoding ;
251
+ }
252
+
253
+ value ++ ;
254
+
255
+ skip_space (value );
256
+
257
+ if (* value == '\'' || * value == '"' ) {
258
+ quote = value ;
259
+ value ++ ;
260
+ if (strstr (value , quote ) != NULL ) {
261
+ while (value != quote ) {
262
+ vstr_append_n (encoding , value , 1 );
263
+ value ++ ;
264
+ }
265
+ return encoding ;
266
+ } else {
267
+ return encoding ;
268
+ }
269
+ } else {
270
+ while (* value != '\0' && isspace ((int )(* value )) == 0 ) {
271
+ vstr_append_n (encoding , value , 1 );
272
+ value ++ ;
273
+ }
274
+ return encoding ;
275
+ }
276
+ };
277
+
278
+ int is_encoding (struct vstr * encoding ) {
279
+ //Is the string a valid encoding?
280
+ //return 1;
281
+ return vstr_in_char_array (encoding , valid_encodings , sizeof (valid_encodings )/sizeof (char * ));
282
+ };
283
+
284
// Return a pointer to the first character of `buf` that is not a tab,
// newline, form feed, vertical tab, carriage return or space. If `buf`
// consists only of such characters the result points at its terminator.
char *skip_space(char *buf) {
    return buf + strspn(buf, "\t\n\f\v\f\r ");
}
0 commit comments