htmllib._pmod
Go to the documentation of this file.
1 /* Copyright (C) 2000-2004 Thomas Bopp, Thorsten Hampel, Ludger Merkens
2  *
3  * This program is free software; you can redistribute it and/or modify
4  * it under the terms of the GNU General Public License as published by
5  * the Free Software Foundation; either version 2 of the License, or
6  * (at your option) any later version.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11  * GNU General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16  *
17  * $Id: htmllib.pmod,v 1.1 2008/03/31 13:39:57 exodusd Exp $
18  */
19  inherit "AbstractCallbacks";
20 #include <classes.h>
21 #include <macros.h>
22 #include <database.h>
23 #include <attributes.h>
24 
25 
26 
27 
28 //#define KEEP_UTF // keep the UTF8 converted encoding for output
29 
30 import httplib;
31 
/**
 * Build a complete HTML anchor element for an object.
 *
 * The attribute part comes from href_link_navigate(obj); the visible label
 * is the optional prefix followed by the object's identifier.
 *
 * @param obj the object to link to
 * @param prefix optional text placed in front of the identifier inside the
 *               anchor; it is used for the label only and is not passed on
 *               to href_link_navigate()
 * @return the markup "<a href=...>label</a>"
 */
string ahref_link_navigate(object obj, void|string prefix)
{
  string label_prefix = stringp(prefix) ? prefix : "";
  string link_attribute = href_link_navigate(obj);

  return "<a " + link_attribute + ">" + label_prefix + obj->get_identifier() +
    "</a>";
}
38 
/**
 * Build the href="..." attribute used to navigate to an object, with an
 * extra string appended to the link target.
 *
 * Objects whose class bits include CLASS_EXIT link to the filename that the
 * "filepath:tree" module reports for the exit's destination; the prefix is
 * not used in that case. All other objects link to
 * prefix + replace_uml(identifier).
 *
 * @param obj the object to link to
 * @param prefix text put in front of the identifier (non-strings count as "")
 * @param postfix text appended to the link target (non-strings count as "")
 * @return the attribute text, e.g. href="target"
 */
string href_link_navigate_postfix(object obj, string prefix, string postfix)
{
  if ( !stringp(prefix) )
    prefix = "";
  if ( !stringp(postfix) )
    postfix = "";

  if ( obj->get_object_class() & CLASS_EXIT ) {
    // link to where the exit leads, not to the exit object itself
    object destination = obj->get_exit();
    string destination_path =
      get_module("filepath:tree")->object_to_filename(destination);
    return "href=\"" + destination_path + postfix + "\"";
  }

  return "href=\"" + prefix + replace_uml(obj->get_identifier()) + postfix +
    "\"";
}
57 
/**
 * Build the href="..." attribute used to navigate to an object.
 *
 * This is href_link_navigate_postfix() with an empty postfix. The logic
 * used to be duplicated line for line; delegating keeps both functions in
 * step. The result is unchanged: objects whose class bits include
 * CLASS_EXIT link to the filename of their destination, everything else
 * links to prefix + replace_uml(identifier).
 *
 * @param obj the object to link to
 * @param prefix optional text put in front of the identifier
 * @return the attribute text, e.g. href="target"
 */
string href_link_navigate(object obj, void|string prefix)
{
  // a missing prefix is normalised to "" by the callee
  return href_link_navigate_postfix(obj, prefix, "");
}
75 
/**
 * Serialise an opening tag with its attributes.
 *
 * Attributes are written in sorted name order so the output is
 * deterministic (a mapping has no defined index order). Double quotes
 * inside a value are written as &quot; so a value cannot end the attribute
 * early. No other escaping is applied, so values that already contain
 * entities pass through unchanged.
 *
 * @param name the element name
 * @param attrs attribute name -> value; a non-mapping is treated as
 *              "no attributes"
 * @return the tag, e.g. <a href="x.html">
 */
string create_tag(string name, mapping attrs)
{
  string attr_string = "";

  if ( mappingp(attrs) ) {
    foreach(sort(indices(attrs)), string a) {
      // cast first: values may be numbers, which plain concatenation accepted
      string value = replace((string)attrs[a], "\"", "&quot;");
      attr_string += " " + a + "=\"" + value + "\"";
    }
  }
  return sprintf("<%s%s>", name, attr_string);
}
84 
/**
 * SAX callback object that re-serialises a parsed HTML document and
 * replaces every element that has a registered handler function with the
 * string that handler returns.
 *
 * While a handled element is open, everything serialised inside it is
 * collected as the element's body instead of being written to the output.
 * When the element closes, the handler is called with the element's
 * attributes and the collected body. Its result goes to the enclosing
 * handled element, if there is one, and to the output otherwise.
 *
 * Open handled elements are kept on a stack, innermost last. A FIFO queue
 * delivers the body of the outermost element when the innermost one closes.
 */
class rxmlHandler
{
  string output = "";              // the output
  mapping rxml_handlers = ([ ]);   // element name -> handler function
  mapping rxml_attributes = ([ ]); // element name -> attributes of the most
                                   // recently opened handled element
  mapping variables = ([ ]);       // passed to every handler call
  string encoding = "utf-8";
  int scriptmode = 0;              // 1 between <script> and </script>

  // No longer used by this class; the declaration is kept only so code that
  // refers to the member still compiles.
  ADT.Queue NodeDataQueue = ADT.Queue();

  // One ({ attributes, collected body }) entry per open handled element.
  private array(array) open_frames = ({ });

  /**
   * @param vars the mapping handed to every handler; it is stored by
   *             reference, not copied
   */
  void create(mapping vars) {
    variables = vars;
  }

  /**
   * Look up the handler for an element name.
   *
   * Start and end callbacks both use this so they always agree on whether
   * an element is handled. An entry that is not a function counts as
   * "no handler".
   *
   * @return the handler function or 0
   */
  private function get_handler(string name) {
    mixed handler = rxml_handlers[name];
    return functionp(handler) ? handler : 0;
  }

  /**
   * Append data to the body of the innermost open handled element.
   *
   * @return 1 if the data was stored, 0 if no handled element is open and
   *         the caller has to write the data to the output itself
   */
  private int store_data(string data) {
    if ( !sizeof(open_frames) )
      return 0;
    open_frames[-1][1] += data;
    return 1;
  }

  /** Start a new document: the output is reset to a marker comment. */
  void startDocumentSAX(object parser, void|mixed userData) {
    output = "<!-- sTeam link consistency and HTML extension parser - modified document view !-->\n";
  }

  /**
   * An element was opened. Handled elements open a new body frame;
   * all other elements are serialised with their attributes.
   */
  void startElementSAX(object parser, string name,
                       mapping(string:string) attrs, void|mixed userData)
  {
    if ( name == "script" )
      scriptmode = 1;

    if ( get_handler(name) ) {
      rxml_attributes[name] = attrs;
      open_frames += ({ ({ attrs, "" }) });
      return;
    }

    string attr_string = "";
    if ( mappingp(attrs) ) {
      foreach(indices(attrs), string a)
        attr_string += " " + a + "=\"" + attrs[a] + "\"";
    }
    string tagstr = "<" + name + attr_string + ">";

    if ( !store_data(tagstr) )
      output += tagstr;
  }

  /**
   * Call a tag handler.
   *
   * The handler receives the shared variables mapping with "args" set to
   * the element's attributes plus a "body" entry holding the collected
   * body. The variables mapping is deliberately not copied, so "args"
   * written here stays visible in the mapping given to create().
   *
   * @param f the handler function
   * @param attributes the element's attributes (non-mappings count as empty)
   * @param data the collected element body
   * @return the handler's result, or an error comment if the handler threw
   */
  protected string call_handler(function f, mapping attributes, string data)
  {
    mapping params = variables;
    if ( !mappingp(attributes) )
      attributes = ([ ]);

    params->args = attributes;
    params->args->body = data;
    string result;
    mixed err = catch(result=f(params));
    if ( err ) {
      FATAL("SAX: error calling handler %s\n%O", err[0], err[1]);
      result = "<!-- error calling handler -->";
    }
    return result;
  }

  /**
   * An element was closed. Handled elements are replaced by their
   * handler's result. Other elements get a closing tag, except "br"
   * (compared case-insensitively), which gets none.
   */
  void endElementSAX(object parser, string name, void|mixed userData)
  {
    string tagstr;

    if ( name == "script" )
      scriptmode = 0;

    function hfunc = get_handler(name);
    if ( functionp(hfunc) ) {
      // Fallback for an end event without a matching open frame.
      mapping attr = rxml_attributes[name];
      string body = "";
      int depth = sizeof(open_frames);
      if ( depth ) {
        attr = open_frames[depth-1][0];
        body = open_frames[depth-1][1];
        open_frames = depth > 1 ? open_frames[..depth-2] : ({ });
      }
      tagstr = call_handler(hfunc, attr, body);
    }
    else if ( lower_case(name) != "br" )
      tagstr = "</" + name + ">";
    else
      return;

    if ( !store_data(tagstr) )
      output += tagstr;
  }

  /** Parser errors are written to the output as comments. */
  void errorSAX(object parser, string msg, void|mixed userData) {
    output += "<!-- SAX: " + msg + "-->\n";
  }

  /** CDATA content; "<" and ">" are escaped unless inside a script. */
  void cdataBlockSAX(object parser, string value, void|mixed userData)
  {
    if ( !scriptmode )
      value = replace(value, ({ "<", ">", }), ({ "&lt;", "&gt;" }));
    if ( !store_data(value) )
      output += value;
  }

  /** Character data; "<" and ">" are escaped unless inside a script. */
  void charactersSAX(object parser, string chars, void|mixed userData)
  {
    if ( !scriptmode )
      chars = replace(chars, ({ "<", ">", }), ({ "&lt;", "&gt;" }));
    if ( !store_data(chars) )
      output += chars;
  }

  /**
   * Comments always go straight to the output, even while a handled
   * element is open.
   */
  void commentSAX(object parser, string value, void|mixed userData)
  {
    output += "<!--"+value+"-->\n";
  }

  /** Logs the reference and appends its name to the output. */
  void referenceSAX(object parser, string name, void|mixed userData)
  {
    werror("referenceSAX(%s)\n", name);
    output += name;
  }

  /** Logs the declaration and appends the entity name to the output. */
  void entityDeclSAX(object parser, string name, int type, string publicId,
                     string systemId, string content, void|mixed userData)
  {
    werror("entityDecl(%s)\n", name);
    output += name;
  }

  /** Logged only. */
  void notationDeclSAX(object parser, string name, string publicId,
                       string systemId, void|mixed userData)
  {
    werror("notationDecl(%s)\n", name);
  }

  /** Logged only. */
  void unparsedEntityDeclSAX(object parser, string name, string publicId,
                             string systemId, string notationName,
                             void|mixed userData)
  {
    werror("unparsedEntityDecl(%s)\n", name);
  }

  /** Logged only; no entity is ever supplied. */
  string getEntitySAX(object parser, string name, void|mixed userData)
  {
    werror("getEntitySax(%s)\n", name);
    return 0;
  }

  /** Logged only. */
  void attributeDeclSAX(object parser, string elem, string fullname,
                        int type, int def, void|mixed userData)
  {
    werror("attributeDeclSAX(%s, %s)\n", elem, fullname);
  }

  /** Ignored. */
  void internalSubsetSAX(object parser, string name, string externalID,
                         string systemID, void|mixed uData)
  {
  }

  /** Ignored. */
  void ignorableWhitespaceSAX(object parser, string chars, void|mixed uData)
  {
  }

  /**
   * Register the tag handlers.
   *
   * @param h element name -> handler function; a non-mapping clears all
   *          handlers instead of breaking every later lookup
   */
  void set_handlers(mapping h)
  {
    rxml_handlers = mappingp(h) ? h : ([ ]);
  }

  /** @return the document serialised so far */
  string get_result()
  {
    return output;
  }
}
266 
/**
 * Derive a tag name from a tag object's identifier.
 *
 * The identifier is matched against the sscanf pattern "%s.pike"; when it
 * matches, the part captured by %s is the tag name. Otherwise the
 * identifier is returned unchanged.
 *
 * @param tag the tag object
 * @return the tag name
 */
string get_tag_name(object tag)
{
  string identifier = tag->get_identifier();
  string base_name;

  if ( sscanf(identifier, "%s.pike", base_name) )
    return base_name;
  return identifier;
}
273 
/**
 * Get the function that implements a tag.
 *
 * The tag object is asked for an instance via provide_instance(); the
 * instance's execute member is the tag function. Errors thrown by
 * provide_instance() are discarded and reported as "no function".
 *
 * @param tag the tag object
 * @return the instance's execute function, or 0 if tag is not an object or
 *         no instance could be obtained
 */
function get_tag_function(object tag)
{
  if ( !objectp(tag) )
    return 0;

  object tag_instance;
  catch {
    tag_instance = tag->provide_instance();
  };

  return objectp(tag_instance) ? tag_instance->execute : 0;
}
285 
286 
287 
/**
 * Run a document through the rxml tag handlers.
 *
 * The document is parsed with xml.HTML using an rxmlHandler, which
 * replaces every element named in tags with the result of its handler.
 *
 * Encoding: for an object, its DOC_ENCODING attribute is used and the
 * encoding argument is ignored. For a string, the argument is used if it is
 * a string, otherwise detect_encoding(html). The value is lower-cased
 * before use.
 *
 * Unless KEEP_UTF is defined, a result whose encoding is a string other
 * than "utf-8" is passed through xml.utf8_to_html(). If that throws and the
 * encoding is "iso-8859-1", xml.utf8_to_isolat1() is tried. If no
 * conversion succeeds, the untouched input is returned.
 *
 * @param html the document, as a string or as a document object
 * @param variables mapping handed to the tag handlers
 * @param tags element name -> handler function
 * @param encoding optional encoding of a string document
 * @return the processed document; "" for an empty input string
 */
string parse_rxml(string|object html, mapping variables, mapping tags, string|void encoding)
{
  object handler = rxmlHandler(variables);
  handler->set_handlers(tags);

  if ( objectp(html) )
    encoding = html->query_attribute(DOC_ENCODING);
  else if ( !stringp(encoding) )
    encoding = detect_encoding(html);
  encoding = lower_case(encoding);

  string input = html;
  if ( stringp(input) && strlen(input) == 0 )
    return "";

  object sax = xml.HTML(input, handler, ([ ]), 0, stringp(html));
  sax->parse(encoding);
  string res = handler->get_result();

#ifndef KEEP_UTF
  // the parser output is utf-8 here; convert back unless the document
  // itself was utf-8 (or its encoding is unknown)
  if ( !stringp(encoding) || encoding == "utf-8" )
    return res;

  if ( !catch(res = xml.utf8_to_html(res)) )
    return res;

  werror("HTML Conversion failed !\n");
  if ( encoding != "iso-8859-1" ) {
    werror("Failed conversion - skipping !\n");
    return html; // do nothing
  }
  if ( catch(res = xml.utf8_to_isolat1(res)) ) {
    werror("Failed conversion - skipping rxml !\n");
    return html;
  }
#endif
  return res;
}
329 
330 
/**
 * Minimal tag used by the self test: greets the value of the tag's
 * "name" attribute.
 */
class testTag {
  /**
   * @param vars handler variables; vars->args holds the tag attributes
   * @return "Hello World to " followed by the "name" attribute
   */
  string execute(mapping vars) {
    mapping tag_args = vars->args;
    return "Hello World to " + tag_args->name;
  }
}
337 
/**
 * Minimal container tag used by the self test: wraps the tag body in a
 * BODY element.
 */
class tagTag {
  /**
   * @param vars handler variables; vars->args->body holds the tag body
   * @return the body enclosed in <BODY> and </BODY>
   */
  string execute(mapping vars) {
    mapping tag_args = vars->args;
    return "<BODY>" + tag_args->body + "</BODY>";
  }
}
344 
// Self test for parse_rxml(): one handled tag inside plain markup, then
// handled tags nested inside another handled tag.
// NOTE(review): the function header that belongs to this block is not
// part of this listing - restore it from the original file.
{
  // first test rxml
  mapping simple_tags = ([ "test": testTag()->execute, ]);
  string result = parse_rxml(
    "<html><body>Welcome! <h2><test name='test'/></h2></body></html>",
    ([ ]), simple_tags);
  if ( result !=
       "<html><body>Welcome! <h2>Hello World to test</h2></body></html>" )
    error("rxml test failed - wrong result " + result);

  // "d" is used both inside the handled "c" element and after it
  mapping nested_tags = ([ "d": testTag()->execute,
                           "c": tagTag()->execute, ]);
  string nested_input = "<a><b>&lt;c&gt;<c apply='1'>"+
    "<d name='x'/></c>"+
    "<d name='y'/></b></a>";
  result = parse_rxml(nested_input, ([ ]), nested_tags);
  if ( result !=
       "<a><b><c><BODY>Hello World to x</BODY>Hello World to y</b></a>" )
    error("nested rxml test failed !");
}
365 
/**
 * Look up a single tag function by name.
 *
 * The tag is the object called name + ".pike" inside the object that
 * OBJ("/tags") resolves to.
 *
 * @param name the tag name, without the ".pike" suffix
 * @return the tag function, or 0 if the container, the tag object or its
 *         function is missing
 */
function find_tag(string name)
{
  object tag_container = OBJ("/tags");
  if ( objectp(tag_container) ) {
    object tag = tag_container->get_object_byname(name + ".pike");
    if ( objectp(tag) )
      return get_tag_function(tag);
  }
  return 0;
}
376 
377 
/**
 * Collect the tag functions that belong to an object.
 *
 * For an object with the CLASS_CONTAINER bit, every inventory object of
 * class CLASS_DOCLPC becomes a tag: get_tag_name() supplies the name and
 * get_tag_function() the function. Tags without a function are reported
 * through FATAL and left out.
 *
 * For an object with the CLASS_DOCXSL bit, the lookup is repeated on the
 * object called "tags" in the object's environment.
 *
 * @param obj a container of tags, or an XSL document
 * @return tag name -> function, or 0 if nothing applies
 */
mapping find_tags(object obj)
{
  if ( !objectp(obj) )
    return 0;

  if ( obj->get_object_class() & CLASS_CONTAINER ) {
    mapping tag_functions = ([ ]);
    foreach(obj->get_inventory_by_class(CLASS_DOCLPC), object tag) {
      function tag_function = get_tag_function(tag);
      string tagname = get_tag_name(tag);
      if ( functionp(tag_function) )
        tag_functions[tagname] = tag_function;
      else
        FATAL("Warning - no tag function for tag: %s", tagname);
    }
    return tag_functions;
  }

  if ( !(obj->get_object_class() & CLASS_DOCXSL) )
    return 0;

  object env = obj->get_environment();
  if ( !objectp(env) )
    return 0;
  return find_tags(env->get_object_byname("tags"));
}
401 
402 
/**
 * Turn the XML entities &lt; &gt; &amp; back into the characters < > &.
 *
 * All three entities are replaced in a single replace() call; the result
 * of one replacement is not scanned again.
 *
 * @param str the string to convert
 * @return the string with the three entities replaced by plain characters
 */
string unquote_xml ( string str )
{
  array(string) entities = ({ "&lt;", "&gt;", "&amp;" });
  array(string) characters = ({ "<", ">", "&" });

  return replace( str, entities, characters );
}
414 
415 
/**
 * Replace the characters < > & with the XML entities &lt; &gt; &amp;.
 *
 * Quote characters are left as they are.
 *
 * @param str the string to convert
 * @return the string with the three characters written as entities
 */
string quote_xml ( string str )
{
  array(string) characters = ({ "<", ">", "&" });
  array(string) entities = ({ "&lt;", "&gt;", "&amp;" });

  return replace( str, characters, entities );
}
427 
428 
/**
 * Replace HTML entities with the characters they stand for
 * (umlauts, <, >, & etc.).
 *
 * The table follows Pike's Protocols.HTTP.unentity(). The replacement
 * characters are written as \x escapes, so the table does not depend on
 * the encoding this file is stored in. Entities that have no
 * single-character replacement in the table (e.g. &emsp;, &mdash;,
 * &trade;) are mapped to themselves and stay unchanged.
 *
 * NOTE(review): the replacement characters are code points U+00A0-U+00FF
 * (one character each), not utf-8 byte sequences. The previous comment
 * promised a utf-8 result - confirm what callers expect and apply
 * string_to_utf8() at the call site if bytes are needed.
 *
 * @param str the string to convert
 * @return the string with the listed entities replaced by characters
 */
string unquote_html ( string str )
{
  return replace( str,

  ({ "&AElig;", "&Aacute;", "&Acirc;", "&Agrave;", "&Aring;", "&Atilde;",
     "&Auml;", "&Ccedil;", "&ETH;", "&Eacute;", "&Ecirc;", "&Egrave;",
     "&Euml;", "&Iacute;", "&Icirc;", "&Igrave;", "&Iuml;", "&Ntilde;",
     "&Oacute;", "&Ocirc;", "&Ograve;", "&Oslash;", "&Otilde;", "&Ouml;",
     "&THORN;", "&Uacute;", "&Ucirc;", "&Ugrave;", "&Uuml;", "&Yacute;",
     "&aacute;", "&acirc;", "&aelig;", "&agrave;", "&apos;", "&aring;",
     "&ast;", "&atilde;", "&auml;", "&brvbar;", "&ccedil;", "&cent;",
     "&colon;", "&comma;", "&commat;", "&copy;", "&deg;", "&dollar;",
     "&eacute;", "&ecirc;", "&egrave;", "&emsp;", "&ensp;", "&equals;",
     "&eth;", "&euml;", "&excl;", "&frac12;", "&frac14;", "&frac34;",
     "&frac18;", "&frac38;", "&frac58;", "&frac78;", "&gt;", "&gt",
     "&half;", "&hyphen;", "&iacute;", "&icirc;", "&iexcl;", "&igrave;",
     "&iquest;", "&iuml;", "&laquo;", "&lpar;", "&lsqb;", "&lt;",
     "&lt", "&mdash;", "&micro;", "&middot;", "&nbsp;", "&ndash;",
     "&not;", "&ntilde;", "&oacute;", "&ocirc;", "&ograve;", "&oslash;",
     "&otilde;", "&ouml;", "&para;", "&percnt;", "&period;", "&plus;",
     "&plusmn;", "&pound;", "&quest;", "&quot;", "&raquo;", "&reg;",
     "&rpar;", "&rsqb;", "&sect;", "&semi;", "&shy;", "&sup1;",
     "&sup2;", "&sup3;", "&szlig;", "&thorn;", "&tilde;", "&trade;",
     "&uacute;", "&ucirc;", "&ugrave;", "&uuml;", "&yacute;", "&yen;",
     "&yuml;", "&verbar;", "&amp;", "&#34;", "&#39;", "&#0;", "&#58;" }),

  // same order and row layout as the entity list above
  ({ "\xC6", "\xC1", "\xC2", "\xC0", "\xC5", "\xC3",
     "\xC4", "\xC7", "\xD0", "\xC9", "\xCA", "\xC8",
     "\xCB", "\xCD", "\xCE", "\xCC", "\xCF", "\xD1",
     "\xD3", "\xD4", "\xD2", "\xD8", "\xD5", "\xD6",
     "\xDE", "\xDA", "\xDB", "\xD9", "\xDC", "\xDD",
     "\xE1", "\xE2", "\xE6", "\xE0", "&apos;", "\xE5",
     "&ast;", "\xE3", "\xE4", "\xA6", "\xE7", "\xA2",
     ":", ",", "&commat;", "\xA9", "\xB0", "$",
     "\xE9", "\xEA", "\xE8", "&emsp;", "&ensp;", "&equals;",
     "\xF0", "\xEB", "!", "\xBD", "\xBC", "\xBE",
     "&frac18;", "&frac38;", "&frac58;", "&frac78;", ">", ">",
     "&half;", "&hyphen;", "\xED", "\xEE", "\xA1", "\xEC",
     "\xBF", "\xEF", "\xAB", "(", "&lsqb;", "<",
     "<", "&mdash;", "\xB5", "\xB7", "\xA0", "&ndash;",
     "\xAC", "\xF1", "\xF3", "\xF4", "\xF2", "\xF8",
     "\xF5", "\xF6", "\xB6", "%", ".", "+",
     "\xB1", "\xA3", "?", "\"", "\xBB", "\xAE",
     ")", "&rsqb;", "\xA7", "&semi;", "\xAD", "\xB9",
     "\xB2", "\xB3", "\xDF", "\xFE", "~", "&trade;",
     "\xFA", "\xFB", "\xF9", "\xFC", "\xFD", "\xA5",
     "\xFF", "&verbar;", "&", "\"", "\'", "\000", ":" })

  );
}
486 
487 
/**
 * Replace umlauts, <, >, & and other special characters with HTML
 * entities.
 *
 * This is the table of Pike's Protocols.HTTP.unentity() reversed. The
 * characters are written as \x escapes, so the table does not depend on
 * the encoding this file is stored in.
 *
 * Each character appears once in the source list. The reversed table used
 * to list >, <, " and : twice (for "&gt"/"&lt" without semicolon and for
 * the numeric forms &#34; and &#58;), which left the chosen entity to the
 * way replace() treats duplicate keys. Only the canonical named entities
 * are kept.
 *
 * Entries that map an entity to itself (e.g. "&apos;", "&mdash;") keep an
 * entity already present in the input from having its "&" quoted again -
 * presumably because replace() prefers the longest match; confirm against
 * the replace() documentation.
 *
 * NOTE(review): the characters are matched as code points U+00A0-U+00FF
 * (one character each), not as utf-8 byte sequences. The previous comment
 * asked for utf-8 input - confirm what callers pass and apply
 * utf8_to_string() at the call site if they pass bytes.
 *
 * @param str the string to convert
 * @return the string with the listed characters written as entities
 */
string quote_html ( string str )
{
  return replace( str,

  ({ "\xC6", "\xC1", "\xC2", "\xC0", "\xC5", "\xC3",
     "\xC4", "\xC7", "\xD0", "\xC9", "\xCA", "\xC8",
     "\xCB", "\xCD", "\xCE", "\xCC", "\xCF", "\xD1",
     "\xD3", "\xD4", "\xD2", "\xD8", "\xD5", "\xD6",
     "\xDE", "\xDA", "\xDB", "\xD9", "\xDC", "\xDD",
     "\xE1", "\xE2", "\xE6", "\xE0", "&apos;", "\xE5",
     "&ast;", "\xE3", "\xE4", "\xA6", "\xE7", "\xA2",
     ":", ",", "&commat;", "\xA9", "\xB0", "$",
     "\xE9", "\xEA", "\xE8", "&emsp;", "&ensp;", "&equals;",
     "\xF0", "\xEB", "!", "\xBD", "\xBC", "\xBE",
     "&frac18;", "&frac38;", "&frac58;", "&frac78;", ">",
     "&half;", "&hyphen;", "\xED", "\xEE", "\xA1", "\xEC",
     "\xBF", "\xEF", "\xAB", "(", "&lsqb;", "<",
     "&mdash;", "\xB5", "\xB7", "\xA0", "&ndash;",
     "\xAC", "\xF1", "\xF3", "\xF4", "\xF2", "\xF8",
     "\xF5", "\xF6", "\xB6", "%", ".", "+",
     "\xB1", "\xA3", "?", "\"", "\xBB", "\xAE",
     ")", "&rsqb;", "\xA7", "&semi;", "\xAD", "\xB9",
     "\xB2", "\xB3", "\xDF", "\xFE", "~", "&trade;",
     "\xFA", "\xFB", "\xF9", "\xFC", "\xFD", "\xA5",
     "\xFF", "&verbar;", "&", "\'", "\000" }),

  // same order and row layout as the character list above
  ({ "&AElig;", "&Aacute;", "&Acirc;", "&Agrave;", "&Aring;", "&Atilde;",
     "&Auml;", "&Ccedil;", "&ETH;", "&Eacute;", "&Ecirc;", "&Egrave;",
     "&Euml;", "&Iacute;", "&Icirc;", "&Igrave;", "&Iuml;", "&Ntilde;",
     "&Oacute;", "&Ocirc;", "&Ograve;", "&Oslash;", "&Otilde;", "&Ouml;",
     "&THORN;", "&Uacute;", "&Ucirc;", "&Ugrave;", "&Uuml;", "&Yacute;",
     "&aacute;", "&acirc;", "&aelig;", "&agrave;", "&apos;", "&aring;",
     "&ast;", "&atilde;", "&auml;", "&brvbar;", "&ccedil;", "&cent;",
     "&colon;", "&comma;", "&commat;", "&copy;", "&deg;", "&dollar;",
     "&eacute;", "&ecirc;", "&egrave;", "&emsp;", "&ensp;", "&equals;",
     "&eth;", "&euml;", "&excl;", "&frac12;", "&frac14;", "&frac34;",
     "&frac18;", "&frac38;", "&frac58;", "&frac78;", "&gt;",
     "&half;", "&hyphen;", "&iacute;", "&icirc;", "&iexcl;", "&igrave;",
     "&iquest;", "&iuml;", "&laquo;", "&lpar;", "&lsqb;", "&lt;",
     "&mdash;", "&micro;", "&middot;", "&nbsp;", "&ndash;",
     "&not;", "&ntilde;", "&oacute;", "&ocirc;", "&ograve;", "&oslash;",
     "&otilde;", "&ouml;", "&para;", "&percnt;", "&period;", "&plus;",
     "&plusmn;", "&pound;", "&quest;", "&quot;", "&raquo;", "&reg;",
     "&rpar;", "&rsqb;", "&sect;", "&semi;", "&shy;", "&sup1;",
     "&sup2;", "&sup3;", "&szlig;", "&thorn;", "&tilde;", "&trade;",
     "&uacute;", "&ucirc;", "&ugrave;", "&uuml;", "&yacute;", "&yen;",
     "&yuml;", "&verbar;", "&amp;", "&#39;", "&#0;" })

  );
}
545 
546 
/**
 * @return the name of this module, "htmllib"
 */
string describe()
{
  return "htmllib";
}
548 
549 
550 
551 };