1 # Vend::Parser - Interchange parser class
3 # $Id: Parser.pm,v 2.13 2007-08-09 13:40:53 pajamian Exp $
5 # Copyright (C) 2002-2007 Interchange Development Group
6 # Copyright (C) 1997-2002 Red Hat, Inc.
8 # Based on HTML::Parser
9 # Copyright 1996 Gisle Aas. All rights reserved.
13 Vend::Parser - Interchange parser class
17 C<Vend::Parser> will tokenize an Interchange page when the $p->parse()
18 method is called. The document to parse can be supplied in arbitrary
19 chunks. Call $p->eof() at the end of the document to flush any remaining
20 text. The return value from parse() is a reference to the parser object.
24 =item $self->start($tag, $attr, $attrseq, $origtext)
26 This method is called when a complete start tag has been recognized.
27 The first argument is the tag name (in lower case) and the second
28 argument is a reference to a hash that contains all attributes found
29 within the start tag. The attribute keys are converted to lower case.
30 Entities found in the attribute values are already expanded. The
31 third argument is a reference to an array with the lower case
32 attribute keys in the original order. The fourth argument is the
33 original text of the tag as it appeared in the Interchange page.
35 =item $self->end($tag)
37 This method is called when an end tag has been recognized. The
38 argument is the lower case tag name.
40 =item $self->text($text)
42 This method is called when plain text in the document is recognized.
43 The text is passed on unmodified and might contain multiple lines.
44 Note that for efficiency reasons entities in the text are B<not>
51 Copyright 2002-2007 Interchange Development Group
52 Copyright 1997-2002 Red Hat, Inc.
53 Original HTML::Parser module copyright 1996 Gisle Aas.
55 This library is free software; you can redistribute it and/or
56 modify it under the same terms as Perl itself.
60 Vend::Parser - Mike Heins <mike@perusion.com>
61 HTML::Parser - Gisle Aas <aas@sn.no>
68 no warnings qw(uninitialized numeric);
70 use HTML::Entities ();
71 use vars qw($VERSION);
72 $VERSION = substr(q$Revision: 2.13 $, 10);
78 my $self = bless { '_buf' => '' }, $class;
91 my $buf = \ $self->{_buf};
92 unless (defined $_[0]) {
93 # signals EOF (assume rest is plain text)
94 $self->text($$buf) if length $$buf;
101 # Parse html text in $$buf. The strategy is to remove complete
102 # tokens from the beginning of $$buf until we can't decide whether
103 # it is a token or not, or the $$buf is empty.
104 while (1) { # the loop will end by returning when text is parsed
105 # If a preceding routine sent the response, stop
107 ${$self->{OUT}} = $self->{_buf} = '';
111 # We try to pull off any plain text (anything before a '[')
112 if ($$buf =~ s/^([^[]+)// ) {
114 #::logDebug("plain eat='$eat'");
117 return $self unless length $$buf;
118 # Find the most common tags
119 } elsif ($$buf =~ s|^(\[([-a-z0-9A-Z_]+)[^"'=\]>]*\])||) {
120 #my $tag=$2; my $eat = $1;
121 #undef $self->{HTML};
122 #::logDebug("tag='$tag' eat='$eat'");
123 #$self->start($tag, {}, [], $eat);
125 $self->start($2, {}, [], $1);
126 # Then, finally we look for a start tag
127 } elsif ($$buf =~ s|^\[||) {
130 $self->{HTML} = 0 if ! defined $self->{HTML};
131 #::logDebug("do [ tag");
133 # First find a tag name. It must immediately follow the
134 # opening '[', then start with a letter, and be followed by
135 # letters, numbers, dot, or underscore.
136 if ($$buf =~ s|^(([a-zA-Z][-a-zA-Z0-9._]*)\s*)||) {
140 my ($nopush, $element);
146 #::logDebug("tag='$tag' eat='$eaten'");
148 # Then we would like to find some attributes
149 while ( $$buf =~ s|^(([_a-zA-Z][-a-zA-Z0-9._]*)\s*)|| or
150 $$buf =~ s|^(([=!<>][=~]?)\s+)|| )
155 #::logDebug("in parse, eaten=$eaten");
161 # The attribute might take an optional value.
162 # First we check for an unquoted value
163 if ($$buf =~ s~(^=\s*([^\|\"\'\`\]\s][^\]>\s]*)\s*)~~) {
165 next unless defined $attr;
167 # or quoted by " or '
168 } elsif ($$buf =~ s~(^=\s*(["\'])(.*?)\2\s*)~~s) {
170 next unless defined $attr;
172 HTML::Entities::decode($val) if $attr{entities};
173 } elsif ($$buf =~ s~(^=\s*([\`\|])(.*?)\2\s*)~~s) {
175 # or quoted by ` to send to [calc]
177 $val = Vend::Interpolate::tag_calc($3)
178 unless defined $Vend::Cfg->{AdminSub}{calc};
180 # or quoted by | to strip leading & trailing whitespace
189 # truncated just after the '=' or inside the attribute
190 } elsif ($$buf =~ m|^(=\s*)$|s or
191 $$buf =~ m|^(=\s*[\"\'].*)|s) {
195 # assume attribute with implicit value, but if not,
196 # no value is set and the eaten value is grown
198 ($attr,$val,$nopush) = $self->implicit($tag,$attr);
199 $old = 1 unless $val;
204 $attr->{OLD} = $val if defined $attr;
207 if(defined $element) {
208 #::logDebug("Found element: $element val=$val");
209 $val = Vend::Interpolate::interpolate_html($val)
210 if $::Pragma->{interpolate_itl_references}
211 and $val =~ /\[\w[-\w]*\s+.*]/s;
212 if(! ref $attr{$attr}) {
213 if ($element =~ /[A-Za-z]/) {
214 $attr{$attr} = { $element => $val };
218 $attr{$attr}->[$element] = $val;
220 push (@attrseq, $attr);
222 elsif(ref($attr{$attr}) eq 'ARRAY') {
223 if($element =~ /\D/) {
224 push @{$attr{$attr}}, $val;
227 $attr{$attr}->[$element] = $val;
230 elsif (ref($attr{$attr}) eq 'HASH') {
231 $attr{$attr}->{$element} = $val;
237 push(@attrseq, $attr) unless $nopush;
240 # At the end there should be a closing ']'
241 if ($$buf =~ s|^\]|| ) {
242 $self->start($tag, \%attr, \@attrseq, "$eaten]");
243 } elsif ($$buf =~ s|^/\s*\]||) {
244 # XML-style empty container tag like [this /]
245 $self->start($tag, \%attr, \@attrseq, "$eaten]", 1);
246 } elsif ($$buf =~ s|^([^\]\n]+\])||) {
248 $self->start($tag, {}, [], $eaten);
250 #::logDebug("eaten $eaten");
251 # Not a conforming start tag, regard it as normal text
256 #::logDebug("eaten $eaten");
259 } elsif (length $$buf) {
260 ::logDebug("remaining: $$buf");
261 die $$buf; # This should never happen
263 # The buffer is empty now
266 return $self if $self->{SEND};