eg/jedit/interchange.xml
[interchange.git] / lib / Vend / Parser.pm
1 # Vend::Parser - Interchange parser class
2 #
3 # $Id: Parser.pm,v 2.13 2007-08-09 13:40:53 pajamian Exp $
4 #
5 # Copyright (C) 2002-2007 Interchange Development Group
6 # Copyright (C) 1997-2002 Red Hat, Inc.
7 #
8 # Based on HTML::Parser
9 # Copyright 1996 Gisle Aas. All rights reserved.
10
11 =head1 NAME
12
13 Vend::Parser - Interchange parser class
14
15 =head1 DESCRIPTION
16
17 C<Vend::Parser> will tokenize an Interchange page when the $p->parse()
18 method is called. The document to parse can be supplied in arbitrary
19 chunks. Call $p->eof() at the end of the document to flush any remaining
20 text. The return value from parse() is a reference to the parser object.
21
22 =over 4
23
24 =item $self->start($tag, $attr, $attrseq, $origtext)
25
26 This method is called when a complete start tag has been recognized.
27 The first argument is the tag name (in lower case) and the second
28 argument is a reference to a hash that contains all attributes found
29 within the start tag. The attribute keys are converted to lower case.
30 Entities found in the attribute values are already expanded. The
31 third argument is a reference to an array with the lower case
32 attribute keys in the original order. The fourth argument is the
33 original text of the tag as it appeared in the Interchange page.
34
35 =item $self->end($tag)
36
37 This method is called when an end tag has been recognized. The
38 argument is the lower case tag name.
39
40 =item $self->text($text)
41
42 This method is called when plain text in the document is recognized.
43 The text is passed on unmodified and might contain multiple lines.
44 Note that for efficiency reasons entities in the text are B<not>
45 expanded. 
46
47 =back
48
49 =head1 COPYRIGHT
50
51 Copyright 2002-2007 Interchange Development Group
52 Copyright 1997-2002 Red Hat, Inc.  
53 Original HTML::Parser module copyright 1996 Gisle Aas.
54
55 This library is free software; you can redistribute it and/or
56 modify it under the same terms as Perl itself.
57
58 =head1 AUTHORS
59
60 Vend::Parser - Mike Heins <mike@perusion.com>
61 HTML::Parser - Gisle Aas <aas@sn.no>
62
63 =cut
64
65 package Vend::Parser;
66
67 use strict;
# Turn off the 'uninitialized' and 'numeric' warning categories for the
# rest of this lexical scope.
68 no warnings qw(uninitialized numeric);
69
# Loaded with an empty import list, so nothing is imported; the parser
# calls HTML::Entities::decode() by its fully qualified name.
70 use HTML::Entities ();
71 use vars qw($VERSION);
# q$...$ uses '$' as the quote delimiter, so the quoted string is
# "Revision: 2.13 "; substr from offset 10 skips "Revision: " and leaves
# the revision number (followed by the trailing space).
72 $VERSION = substr(q$Revision: 2.13 $, 10);
73
74
# Constructor.  Returns a new parser object blessed into the given class.
# The object is a hash whose only initial entry is '_buf', the (empty)
# buffer of input that has been received but not yet tokenized.
sub new {
    my ($class) = @_;

    my %state = ( '_buf' => '' );

    return bless \%state, $class;
}
81
82
# Signals end of input by calling the object's parse() method with a
# single undef argument, and returns whatever parse() returns.
sub eof {
    my ($self) = @_;

    return $self->parse(undef);
}
87
# parse($chunk)
#
# Appends $chunk to the pending-input buffer ($self->{_buf}) and then removes
# as many complete tokens as possible from the front of that buffer, calling
# $self->text(...) for plain text and $self->start(...) for bracketed tags.
#
# Passing undef signals end of input: whatever is still buffered is handed
# to text() and the buffer is emptied.
#
# Returns $self on every path.  When the buffer ends in the middle of an
# attribute value, the text consumed for the current tag is put back into
# the buffer and the method returns, so a later call can retry the tag.
#
# text(), start() and implicit() are not defined in the code shown here;
# presumably a subclass provides them -- NOTE(review): confirm.
88 sub parse
89 {
90         my $self = shift;
           # $buf is a reference to the object's buffer, so every change made
           # through $$buf below is a change to $self->{_buf} itself.
91         my $buf = \ $self->{_buf};
92         unless (defined $_[0]) {
93                 # signals EOF (assume rest is plain text)
94                 $self->text($$buf) if length $$buf;
95                 $$buf = '';
96                 return $self;
97         }
98         $$buf .= $_[0];
99
           # Accumulates the raw text consumed for the tag currently being parsed.
100         my $eaten;
101         # Parse html text in $$buf.  The strategy is to remove complete
102         # tokens from the beginning of $$buf until we can't decide whether
103         # it is a token or not, or the $$buf is empty.
104         while (1) {  # the loop will end by returning when text is parsed
105                 # If a preceding routine sent the response, stop 
                    # (the scalar referenced by $self->{OUT}, the buffer and
                    # @Vend::Output are all emptied before returning)
106                 if ($Vend::Sent) {
107                         ${$self->{OUT}} = $self->{_buf} = '';
108                         @Vend::Output = ();
109                         return $self;
110                 }
111                 # We try to pull off any plain text (anything before a '[')
112                 if ($$buf =~ s/^([^[]+)// ) {
113 #my $eat = $1;
114 #::logDebug("plain eat='$eat'");
115 #$self->text($eat);
116                         $self->text($1);
117                         return $self unless length $$buf;
118                 # Find the most common tags
                    # Fast path: '[name ...]' where nothing between the name and
                    # the closing ']' is a quote, '=', ']' or '>'.  start() gets
                    # the tag name exactly as written (no lc here), an empty
                    # attribute hash and sequence, and the raw tag text.
119                 } elsif ($$buf =~ s|^(\[([-a-z0-9A-Z_]+)[^"'=\]>]*\])||) {
120 #my $tag=$2; my $eat = $1;
121 #undef $self->{HTML};
122 #::logDebug("tag='$tag' eat='$eat'");
123 #$self->start($tag, {}, [], $eat);
124                                 undef $self->{HTML};
125                                 $self->start($2, {}, [], $1);
126                 # Then, finally we look for a start tag
127                 } elsif ($$buf =~ s|^\[||) {
128                         # start tag
129                         $eaten = '[';
130                         $self->{HTML} = 0 if ! defined $self->{HTML};
131 #::logDebug("do [ tag");
132
133                         # First find a tag name. It must immediately follow the
134                         # opening '[', then start with a letter, and be followed by
135                         # letters, numbers, dot, or underscore.
                            # (the character class below also accepts '-')
136                         if ($$buf =~ s|^(([a-zA-Z][-a-zA-Z0-9._]*)\s*)||) {
137                                 $eaten .= $1;
138
139                                 my ($tag);
140                                 my ($nopush, $element);
141                                 my %attr;
142                                 my @attrseq;
                                    # Once set, every remaining valueless word in
                                    # this tag is consumed into $eaten but stored
                                    # nowhere (see 'next if $old' below).
143                                 my $old;
144
145                                 $tag = lc $2;
146 #::logDebug("tag='$tag' eat='$eaten'");
147
148                                 # Then we would like to find some attributes
                                    # An "attribute" is either a name, or a comparison
                                    # operator (one of = ! < > optionally followed by
                                    # = or ~) that must be followed by whitespace.
149                                 while ( $$buf =~ s|^(([_a-zA-Z][-a-zA-Z0-9._]*)\s*)|| or
150                                                 $$buf =~ s|^(([=!<>][=~]?)\s+)||                 )
151                                 {
152                                         $eaten .= $1;
                                            # Attribute keys are lower-cased and hyphens
                                            # become underscores.
153                                         my $attr = lc $2;
154                                         $attr =~ tr/-/_/;
155 #::logDebug("in parse, eaten=$eaten");
                                            # 'name.element' form: everything after the
                                            # first dot is split off into $element.
156                                         $attr =~ s/\.(.*)//
157                                                 and $element = $1;
158                                                 
159                                         my $val;
160                                         
161                                         # The attribute might take an optional value.
162                                         # First we check for an unquoted value
                                            # (first char may not be | " ' ` ] or space;
                                            # the value ends at ']', '>' or whitespace)
163                                         if ($$buf =~ s~(^=\s*([^\|\"\'\`\]\s][^\]>\s]*)\s*)~~) {
164                                                 $eaten .= $1;
                                                    # NOTE(review): $attr comes from 'lc $2'
                                                    # above, so it looks always defined here.
165                                                 next unless defined $attr;
166                                                 $val = $2;
167                                         # or quoted by " or '
                                            # (non-greedy up to the same quote char; /s
                                            # lets the value span newlines)
168                                         } elsif ($$buf =~ s~(^=\s*(["\'])(.*?)\2\s*)~~s) {
169                                                 $eaten .= $1;
170                                                 next unless defined $attr;
171                                                 $val = $3;
                                                    # Entities are decoded only when a true
                                                    # 'entities' attribute has already been
                                                    # stored in %attr for this tag.
172                                                 HTML::Entities::decode($val) if $attr{entities};
173                                         } elsif ($$buf =~ s~(^=\s*([\`\|])(.*?)\2\s*)~~s) {
174                                                 $eaten .= $1;
175                                                 # or quoted by ` to send to [calc]
                                                    # (skipped, leaving $val undef, when
                                                    # $Vend::Cfg->{AdminSub}{calc} is defined)
176                                                 if    ($2 eq '`') {
177                                                         $val = Vend::Interpolate::tag_calc($3)
178                                                                 unless defined $Vend::Cfg->{AdminSub}{calc};
179                                                 }
180                                                 # or quoted by | to strip leading & trailing whitespace
181                                                 elsif ($2 eq '|') {
182                                                                 $val = $3;
183                                                                 $val =~ s/^\s+//;
184                                                                 $val =~ s/\s+$//;
185                                                 }
                                                    # The character class above only admits
                                                    # ` and |, so this is a safety net.
186                                                 else {
187                                                         die "parse error!";
188                                                 }
189                                         # truncated just after the '=' or inside the attribute
                                            # Put everything consumed for this tag back in
                                            # front of the unparsed remainder and return.
                                            # %attr and @attrseq are lexical to this pass,
                                            # so the tag is re-parsed from its '[' later.
190                                         } elsif ($$buf =~ m|^(=\s*)$|s or
191                                                          $$buf =~ m|^(=\s*[\"\'].*)|s) {
192                                                 $$buf = "$eaten$1";
193                                                 return $self;
194                                         } elsif (!$old) {
195                                                 # assume attribute with implicit value, but if not,
196                                                 # no value is set and the eaten value is grown
197                                                 undef $nopush;
198                                                 ($attr,$val,$nopush) = $self->implicit($tag,$attr);
199                                                 $old = 1 unless $val;
200
201                                         }
202                                         next if $old;
                                            # NOTE(review): this branch runs only when $attr
                                            # is false.  '$attr->{OLD}' dereferences that
                                            # scalar, not the %attr hash; under 'use strict'
                                            # a defined-but-false $attr would die here.
                                            # '$attr{OLD}' may have been intended -- confirm.
203                                         if(! $attr) {
204                                                 $attr->{OLD} = $val if defined $attr;
205                                                 next;
206                                         }
                                            # 'name.element=value': build a nested hash or
                                            # array under $attr{name}.
207                                         if(defined $element) {
208 #::logDebug("Found element: $element val=$val");
                                                    # Interpolate the value first when the pragma
                                                    # is set and the value matches the tag-like
                                                    # pattern below.
209                                                 $val = Vend::Interpolate::interpolate_html($val)
210                                                         if  $::Pragma->{interpolate_itl_references}
211                                                         and $val =~ /\[\w[-\w]*\s+.*]/s;
                                                    # First element seen for this attribute: an
                                                    # element containing a letter starts a hash,
                                                    # anything else starts an array.  The name
                                                    # is added to @attrseq only here.
212                                                 if(! ref $attr{$attr}) {
213                                                         if ($element =~ /[A-Za-z]/) {
214                                                                 $attr{$attr} = { $element => $val };
215                                                         }
216                                                         else {
217                                                                 $attr{$attr} = [ ];
218                                                                 $attr{$attr}->[$element] = $val;
219                                                         }
220                                                         push (@attrseq, $attr);
221                                                 }
                                                    # Existing array: an element containing a
                                                    # non-digit appends, otherwise it is an index.
222                                                 elsif(ref($attr{$attr}) eq 'ARRAY') {
223                                                         if($element =~ /\D/) {
224                                                                 push @{$attr{$attr}}, $val;
225                                                         }
226                                                         else {
227                                                                 $attr{$attr}->[$element] = $val;
228                                                         }
229                                                 }
230                                                 elsif (ref($attr{$attr}) eq 'HASH') {
231                                                         $attr{$attr}->{$element} = $val;
232                                                 }
233                                                 undef $element;
234                                                 next;
235                                         }
                                            # Plain scalar attribute.  $nopush is the third
                                            # value returned by implicit(); when true the key
                                            # is stored but left out of @attrseq.
236                                         $attr{$attr} = $val;
237                                         push(@attrseq, $attr) unless $nopush;
238                                 }
239
240                                 # At the end there should be a closing ']'
241                                 if ($$buf =~ s|^\]|| ) {
242                                         $self->start($tag, \%attr, \@attrseq, "$eaten]");
243                                 } elsif ($$buf =~ s|^/\s*\]||) {
244                                         # XML-style empty container tag like [this /]
                                            # (start() receives a fifth argument of 1)
245                                         $self->start($tag, \%attr, \@attrseq, "$eaten]", 1);
                                    # Unparsed text remains before a ']' on the same line:
                                    # the attributes collected so far are dropped and start()
                                    # gets empty attributes plus the whole raw tag text.
246                                 } elsif ($$buf =~ s|^([^\]\n]+\])||) {
247                                         $eaten .= $1;
248                                         $self->start($tag, {}, [], $eaten);
249                                 } else {
250 #::logDebug("eaten $eaten");
251                                         # Not a conforming start tag, regard it as normal text
252                                         $self->text($eaten);
253                                 }
254
255                         } else {
                                    # '[' not followed by a tag name: emit the bare '['
                                    # as text.
256 #::logDebug("eaten $eaten");
257                                 $self->text($eaten);
258                         }
                    # A non-empty buffer starts either with a non-'[' character
                    # (first branch) or with '[' (third branch), so this branch
                    # is a safety net.
259                 } elsif (length $$buf) {
260                         ::logDebug("remaining: $$buf");
261                         die $$buf; # This should never happen
262                 } else {
263                         # The buffer is empty now
264                         return $self;
265                 }
                    # Stop after the current token when $self->{SEND} is true,
                    # leaving any remaining input in the buffer.
266                 return $self if $self->{SEND};
267         }
            # Not reached: the loop above only exits via return or die.
268         $self;
269 }
270
271
272 1;
273 __END__