diff options
author | Jocelyn Turcotte <jocelyn.turcotte@digia.com> | 2014-08-08 14:30:41 +0200 |
---|---|---|
committer | Jocelyn Turcotte <jocelyn.turcotte@digia.com> | 2014-08-12 13:49:54 +0200 |
commit | ab0a50979b9eb4dfa3320eff7e187e41efedf7a9 (patch) | |
tree | 498dfb8a97ff3361a9f7486863a52bb4e26bb898 /chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML | |
parent | 4ce69f7403811819800e7c5ae1318b2647e778d1 (diff) |
Update Chromium to beta version 37.0.2062.68
Change-Id: I188e3b5aff1bec75566014291b654eb19f5bc8ca
Reviewed-by: Andras Becsi <andras.becsi@digia.com>
Diffstat (limited to 'chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML')
7 files changed, 0 insertions, 2864 deletions
diff --git a/chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML/Entities.pm b/chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML/Entities.pm deleted file mode 100644 index 1e7dfc1f069..00000000000 --- a/chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML/Entities.pm +++ /dev/null @@ -1,491 +0,0 @@ -package HTML::Entities; - -# $Id: Entities.pm,v 1.35 2006/03/22 09:15:23 gisle Exp $ - -=head1 NAME - -HTML::Entities - Encode or decode strings with HTML entities - -=head1 SYNOPSIS - - use HTML::Entities; - - $a = "V&aring;re norske tegn b&oslash;r &#230res"; - decode_entities($a); - encode_entities($a, "\200-\377"); - -For example, this: - - $input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé"; - print encode_entities($input), "\n" - -Prints this out: - - vis-&agrave;-vis Beyonc&eacute;'s na&iuml;ve - papier-m&acirc;ch&eacute; r&eacute;sum&eacute; - -=head1 DESCRIPTION - -This module deals with encoding and decoding of strings with HTML -character entities. The module provides the following functions: - -=over 4 - -=item decode_entities( $string, ... ) - -This routine replaces HTML entities found in the $string with the -corresponding Unicode character. Under perl 5.6 and earlier only -characters in the Latin-1 range are replaced. Unrecognized -entities are left alone. - -If multiple strings are provided as argument they are each decoded -separately and the same number of strings are returned. - -If called in void context the arguments are decoded in-place. - -This routine is exported by default. - -=item _decode_entities( $string, \%entity2char ) - -=item _decode_entities( $string, \%entity2char, $expand_prefix ) - -This will in-place replace HTML entities in $string. The %entity2char -hash must be provided. Named entities not found in the %entity2char -hash are left alone. Numeric entities are expanded unless their value -overflow. - -The keys in %entity2char are the entity names to be expanded and their -values are what they should expand into. 
The values do not have to be -single character strings. If a key has ";" as suffix, -then occurrences in $string are only expanded if properly terminated -with ";". Entities without ";" will be expanded regardless of how -they are terminated for compatiblity with how common browsers treat -entities in the Latin-1 range. - -If $expand_prefix is TRUE then entities without trailing ";" in -%entity2char will even be expanded as a prefix of a longer -unrecognized name. The longest matching name in %entity2char will be -used. This is mainly present for compatibility with an MSIE -misfeature. - - $string = "foo&nbspbar"; - _decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1); - print $string; # will print "foo bar" - -This routine is exported by default. - -=item encode_entities( $string ) - -=item encode_entities( $string, $unsafe_chars ) - -This routine replaces unsafe characters in $string with their entity -representation. A second argument can be given to specify which -characters to consider unsafe (i.e., which to escape). The default set -of characters to encode are control chars, high-bit chars, and the -C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >> -characters. But this, for example, would encode I<just> the -C<< < >>, C<< & >>, C<< > >>, and C<< " >> characters: - - $encoded = encode_entities($input, '<>&"'); - -This routine is exported by default. - -=item encode_entities_numeric( $string ) - -=item encode_entities_numeric( $string, $unsafe_chars ) - -This routine works just like encode_entities, except that the replacement -entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For -example, C<encode_entities("r\xF4le")> returns "r&ocirc;le", but -C<encode_entities_numeric("r\xF4le")> returns "r&#xF4;le". - -This routine is I<not> exported by default. 
But you can always -export it with C<use HTML::Entities qw(encode_entities_numeric);> -or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);> - -=back - -All these routines modify the string passed as the first argument, if -called in a void context. In scalar and array contexts, the encoded or -decoded string is returned (without changing the input string). - -If you prefer not to import these routines into your namespace, you can -call them as: - - use HTML::Entities (); - $decoded = HTML::Entities::decode($a); - $encoded = HTML::Entities::encode($a); - $encoded = HTML::Entities::encode_numeric($a); - -The module can also export the %char2entity and the %entity2char -hashes, which contain the mapping from all characters to the -corresponding entities (and vice versa, respectively). - -=head1 COPYRIGHT - -Copyright 1995-2006 Gisle Aas. All rights reserved. - -This library is free software; you can redistribute it and/or -modify it under the same terms as Perl itself. - -=cut - -use strict; -use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); -use vars qw(%entity2char %char2entity); - -require 5.004; -require Exporter; -@ISA = qw(Exporter); - -@EXPORT = qw(encode_entities decode_entities _decode_entities); -@EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric); - -$VERSION = sprintf("%d.%02d", q$Revision: 1.35 $ =~ /(\d+)\.(\d+)/); -sub Version { $VERSION; } - -require HTML::Parser; # for fast XS implemented decode_entities - - -%entity2char = ( - # Some normal chars that have special meaning in SGML context - amp => '&', # ampersand -'gt' => '>', # greater than -'lt' => '<', # less than - quot => '"', # double quote - apos => "'", # single quote - - # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML - AElig => chr(198), # capital AE diphthong (ligature) - Aacute => chr(193), # capital A, acute accent - Acirc => chr(194), # capital A, circumflex accent - Agrave => chr(192), # capital A, grave accent - Aring => chr(197), # capital A, ring 
- Atilde => chr(195), # capital A, tilde - Auml => chr(196), # capital A, dieresis or umlaut mark - Ccedil => chr(199), # capital C, cedilla - ETH => chr(208), # capital Eth, Icelandic - Eacute => chr(201), # capital E, acute accent - Ecirc => chr(202), # capital E, circumflex accent - Egrave => chr(200), # capital E, grave accent - Euml => chr(203), # capital E, dieresis or umlaut mark - Iacute => chr(205), # capital I, acute accent - Icirc => chr(206), # capital I, circumflex accent - Igrave => chr(204), # capital I, grave accent - Iuml => chr(207), # capital I, dieresis or umlaut mark - Ntilde => chr(209), # capital N, tilde - Oacute => chr(211), # capital O, acute accent - Ocirc => chr(212), # capital O, circumflex accent - Ograve => chr(210), # capital O, grave accent - Oslash => chr(216), # capital O, slash - Otilde => chr(213), # capital O, tilde - Ouml => chr(214), # capital O, dieresis or umlaut mark - THORN => chr(222), # capital THORN, Icelandic - Uacute => chr(218), # capital U, acute accent - Ucirc => chr(219), # capital U, circumflex accent - Ugrave => chr(217), # capital U, grave accent - Uuml => chr(220), # capital U, dieresis or umlaut mark - Yacute => chr(221), # capital Y, acute accent - aacute => chr(225), # small a, acute accent - acirc => chr(226), # small a, circumflex accent - aelig => chr(230), # small ae diphthong (ligature) - agrave => chr(224), # small a, grave accent - aring => chr(229), # small a, ring - atilde => chr(227), # small a, tilde - auml => chr(228), # small a, dieresis or umlaut mark - ccedil => chr(231), # small c, cedilla - eacute => chr(233), # small e, acute accent - ecirc => chr(234), # small e, circumflex accent - egrave => chr(232), # small e, grave accent - eth => chr(240), # small eth, Icelandic - euml => chr(235), # small e, dieresis or umlaut mark - iacute => chr(237), # small i, acute accent - icirc => chr(238), # small i, circumflex accent - igrave => chr(236), # small i, grave accent - iuml => chr(239), # small 
i, dieresis or umlaut mark - ntilde => chr(241), # small n, tilde - oacute => chr(243), # small o, acute accent - ocirc => chr(244), # small o, circumflex accent - ograve => chr(242), # small o, grave accent - oslash => chr(248), # small o, slash - otilde => chr(245), # small o, tilde - ouml => chr(246), # small o, dieresis or umlaut mark - szlig => chr(223), # small sharp s, German (sz ligature) - thorn => chr(254), # small thorn, Icelandic - uacute => chr(250), # small u, acute accent - ucirc => chr(251), # small u, circumflex accent - ugrave => chr(249), # small u, grave accent - uuml => chr(252), # small u, dieresis or umlaut mark - yacute => chr(253), # small y, acute accent - yuml => chr(255), # small y, dieresis or umlaut mark - - # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96) - copy => chr(169), # copyright sign - reg => chr(174), # registered sign - nbsp => chr(160), # non breaking space - - # Additional ISO-8859/1 entities listed in rfc1866 (section 14) - iexcl => chr(161), - cent => chr(162), - pound => chr(163), - curren => chr(164), - yen => chr(165), - brvbar => chr(166), - sect => chr(167), - uml => chr(168), - ordf => chr(170), - laquo => chr(171), -'not' => chr(172), # not is a keyword in perl - shy => chr(173), - macr => chr(175), - deg => chr(176), - plusmn => chr(177), - sup1 => chr(185), - sup2 => chr(178), - sup3 => chr(179), - acute => chr(180), - micro => chr(181), - para => chr(182), - middot => chr(183), - cedil => chr(184), - ordm => chr(186), - raquo => chr(187), - frac14 => chr(188), - frac12 => chr(189), - frac34 => chr(190), - iquest => chr(191), -'times' => chr(215), # times is a keyword in perl - divide => chr(247), - - ( $] > 5.007 ? 
( - 'OElig;' => chr(338), - 'oelig;' => chr(339), - 'Scaron;' => chr(352), - 'scaron;' => chr(353), - 'Yuml;' => chr(376), - 'fnof;' => chr(402), - 'circ;' => chr(710), - 'tilde;' => chr(732), - 'Alpha;' => chr(913), - 'Beta;' => chr(914), - 'Gamma;' => chr(915), - 'Delta;' => chr(916), - 'Epsilon;' => chr(917), - 'Zeta;' => chr(918), - 'Eta;' => chr(919), - 'Theta;' => chr(920), - 'Iota;' => chr(921), - 'Kappa;' => chr(922), - 'Lambda;' => chr(923), - 'Mu;' => chr(924), - 'Nu;' => chr(925), - 'Xi;' => chr(926), - 'Omicron;' => chr(927), - 'Pi;' => chr(928), - 'Rho;' => chr(929), - 'Sigma;' => chr(931), - 'Tau;' => chr(932), - 'Upsilon;' => chr(933), - 'Phi;' => chr(934), - 'Chi;' => chr(935), - 'Psi;' => chr(936), - 'Omega;' => chr(937), - 'alpha;' => chr(945), - 'beta;' => chr(946), - 'gamma;' => chr(947), - 'delta;' => chr(948), - 'epsilon;' => chr(949), - 'zeta;' => chr(950), - 'eta;' => chr(951), - 'theta;' => chr(952), - 'iota;' => chr(953), - 'kappa;' => chr(954), - 'lambda;' => chr(955), - 'mu;' => chr(956), - 'nu;' => chr(957), - 'xi;' => chr(958), - 'omicron;' => chr(959), - 'pi;' => chr(960), - 'rho;' => chr(961), - 'sigmaf;' => chr(962), - 'sigma;' => chr(963), - 'tau;' => chr(964), - 'upsilon;' => chr(965), - 'phi;' => chr(966), - 'chi;' => chr(967), - 'psi;' => chr(968), - 'omega;' => chr(969), - 'thetasym;' => chr(977), - 'upsih;' => chr(978), - 'piv;' => chr(982), - 'ensp;' => chr(8194), - 'emsp;' => chr(8195), - 'thinsp;' => chr(8201), - 'zwnj;' => chr(8204), - 'zwj;' => chr(8205), - 'lrm;' => chr(8206), - 'rlm;' => chr(8207), - 'ndash;' => chr(8211), - 'mdash;' => chr(8212), - 'lsquo;' => chr(8216), - 'rsquo;' => chr(8217), - 'sbquo;' => chr(8218), - 'ldquo;' => chr(8220), - 'rdquo;' => chr(8221), - 'bdquo;' => chr(8222), - 'dagger;' => chr(8224), - 'Dagger;' => chr(8225), - 'bull;' => chr(8226), - 'hellip;' => chr(8230), - 'permil;' => chr(8240), - 'prime;' => chr(8242), - 'Prime;' => chr(8243), - 'lsaquo;' => chr(8249), - 'rsaquo;' => chr(8250), 
- 'oline;' => chr(8254), - 'frasl;' => chr(8260), - 'euro;' => chr(8364), - 'image;' => chr(8465), - 'weierp;' => chr(8472), - 'real;' => chr(8476), - 'trade;' => chr(8482), - 'alefsym;' => chr(8501), - 'larr;' => chr(8592), - 'uarr;' => chr(8593), - 'rarr;' => chr(8594), - 'darr;' => chr(8595), - 'harr;' => chr(8596), - 'crarr;' => chr(8629), - 'lArr;' => chr(8656), - 'uArr;' => chr(8657), - 'rArr;' => chr(8658), - 'dArr;' => chr(8659), - 'hArr;' => chr(8660), - 'forall;' => chr(8704), - 'part;' => chr(8706), - 'exist;' => chr(8707), - 'empty;' => chr(8709), - 'nabla;' => chr(8711), - 'isin;' => chr(8712), - 'notin;' => chr(8713), - 'ni;' => chr(8715), - 'prod;' => chr(8719), - 'sum;' => chr(8721), - 'minus;' => chr(8722), - 'lowast;' => chr(8727), - 'radic;' => chr(8730), - 'prop;' => chr(8733), - 'infin;' => chr(8734), - 'ang;' => chr(8736), - 'and;' => chr(8743), - 'or;' => chr(8744), - 'cap;' => chr(8745), - 'cup;' => chr(8746), - 'int;' => chr(8747), - 'there4;' => chr(8756), - 'sim;' => chr(8764), - 'cong;' => chr(8773), - 'asymp;' => chr(8776), - 'ne;' => chr(8800), - 'equiv;' => chr(8801), - 'le;' => chr(8804), - 'ge;' => chr(8805), - 'sub;' => chr(8834), - 'sup;' => chr(8835), - 'nsub;' => chr(8836), - 'sube;' => chr(8838), - 'supe;' => chr(8839), - 'oplus;' => chr(8853), - 'otimes;' => chr(8855), - 'perp;' => chr(8869), - 'sdot;' => chr(8901), - 'lceil;' => chr(8968), - 'rceil;' => chr(8969), - 'lfloor;' => chr(8970), - 'rfloor;' => chr(8971), - 'lang;' => chr(9001), - 'rang;' => chr(9002), - 'loz;' => chr(9674), - 'spades;' => chr(9824), - 'clubs;' => chr(9827), - 'hearts;' => chr(9829), - 'diams;' => chr(9830), - ) : ()) -); - - -# Make the opposite mapping -while (my($entity, $char) = each(%entity2char)) { - $entity =~ s/;\z//; - $char2entity{$char} = "&$entity;"; -} -delete $char2entity{"'"}; # only one-way decoding - -# Fill in missing entities -for (0 .. 
255) { - next if exists $char2entity{chr($_)}; - $char2entity{chr($_)} = "&#$_;"; -} - -my %subst; # compiled encoding regexps - -sub decode_entities_old -{ - my $array; - if (defined wantarray) { - $array = [@_]; # copy - } else { - $array = \@_; # modify in-place - } - my $c; - for (@$array) { - s/(&\#(\d+);?)/$2 < 256 ? chr($2) : $1/eg; - s/(&\#[xX]([0-9a-fA-F]+);?)/$c = hex($2); $c < 256 ? chr($c) : $1/eg; - s/(&(\w+);?)/$entity2char{$2} || $1/eg; - } - wantarray ? @$array : $array->[0]; -} - -sub encode_entities -{ - my $ref; - if (defined wantarray) { - my $x = $_[0]; - $ref = \$x; # copy - } else { - $ref = \$_[0]; # modify in-place - } - if (defined $_[1] and length $_[1]) { - unless (exists $subst{$_[1]}) { - # Because we can't compile regex we fake it with a cached sub - my $code = "sub {\$_[0] =~ s/([$_[1]])/\$char2entity{\$1} || num_entity(\$1)/ge; }"; - $subst{$_[1]} = eval $code; - die( $@ . " while trying to turn range: \"$_[1]\"\n " - . "into code: $code\n " - ) if $@; - } - &{$subst{$_[1]}}($$ref); - } else { - # Encode control chars, high bit chars and '<', '&', '>', ''' and '"' - $$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge; - } - $$ref; -} - -sub encode_entities_numeric { - local %char2entity; - return &encode_entities; # a goto &encode_entities wouldn't work -} - - -sub num_entity { - sprintf "&#x%X;", ord($_[0]); -} - -# Set up aliases -*encode = \&encode_entities; -*encode_numeric = \&encode_entities_numeric; -*encode_numerically = \&encode_entities_numeric; -*decode = \&decode_entities; - -1; diff --git a/chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML/Filter.pm b/chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML/Filter.pm deleted file mode 100644 index 21fafac621a..00000000000 --- a/chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML/Filter.pm +++ /dev/null @@ -1,112 +0,0 @@ -package HTML::Filter; - -use strict; -use vars qw(@ISA $VERSION); - 
-require HTML::Parser; -@ISA=qw(HTML::Parser); - -$VERSION = sprintf("%d.%02d", q$Revision: 2.11 $ =~ /(\d+)\.(\d+)/); - -sub declaration { $_[0]->output("<!$_[1]>") } -sub process { $_[0]->output($_[2]) } -sub comment { $_[0]->output("<!--$_[1]-->") } -sub start { $_[0]->output($_[4]) } -sub end { $_[0]->output($_[2]) } -sub text { $_[0]->output($_[1]) } - -sub output { print $_[1] } - -1; - -__END__ - -=head1 NAME - -HTML::Filter - Filter HTML text through the parser - -=head1 NOTE - -B<This module is deprecated.> The C<HTML::Parser> now provides the -functionally of C<HTML::Filter> much more efficiently with the the -C<default> handler. - -=head1 SYNOPSIS - - require HTML::Filter; - $p = HTML::Filter->new->parse_file("index.html"); - -=head1 DESCRIPTION - -C<HTML::Filter> is an HTML parser that by default prints the -original text of each HTML element (a slow version of cat(1) basically). -The callback methods may be overridden to modify the filtering for some -HTML elements and you can override output() method which is called to -print the HTML text. - -C<HTML::Filter> is a subclass of C<HTML::Parser>. This means that -the document should be given to the parser by calling the $p->parse() -or $p->parse_file() methods. - -=head1 EXAMPLES - -The first example is a filter that will remove all comments from an -HTML file. This is achieved by simply overriding the comment method -to do nothing. - - package CommentStripper; - require HTML::Filter; - @ISA=qw(HTML::Filter); - sub comment { } # ignore comments - -The second example shows a filter that will remove any E<lt>TABLE>s -found in the HTML file. We specialize the start() and end() methods -to count table tags and then make output not happen when inside a -table. 
- - package TableStripper; - require HTML::Filter; - @ISA=qw(HTML::Filter); - sub start - { - my $self = shift; - $self->{table_seen}++ if $_[0] eq "table"; - $self->SUPER::start(@_); - } - - sub end - { - my $self = shift; - $self->SUPER::end(@_); - $self->{table_seen}-- if $_[0] eq "table"; - } - - sub output - { - my $self = shift; - unless ($self->{table_seen}) { - $self->SUPER::output(@_); - } - } - -If you want to collect the parsed text internally you might want to do -something like this: - - package FilterIntoString; - require HTML::Filter; - @ISA=qw(HTML::Filter); - sub output { push(@{$_[0]->{fhtml}}, $_[1]) } - sub filtered_html { join("", @{$_[0]->{fhtml}}) } - -=head1 SEE ALSO - -L<HTML::Parser> - -=head1 COPYRIGHT - -Copyright 1997-1999 Gisle Aas. - -This library is free software; you can redistribute it and/or -modify it under the same terms as Perl itself. - -=cut diff --git a/chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML/HeadParser.pm b/chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML/HeadParser.pm deleted file mode 100644 index a8974f832b6..00000000000 --- a/chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML/HeadParser.pm +++ /dev/null @@ -1,259 +0,0 @@ -package HTML::HeadParser; - -=head1 NAME - -HTML::HeadParser - Parse <HEAD> section of a HTML document - -=head1 SYNOPSIS - - require HTML::HeadParser; - $p = HTML::HeadParser->new; - $p->parse($text) and print "not finished"; - - $p->header('Title') # to access <title>....</title> - $p->header('Content-Base') # to access <base href="http://..."> - $p->header('Foo') # to access <meta http-equiv="Foo" content="..."> - -=head1 DESCRIPTION - -The C<HTML::HeadParser> is a specialized (and lightweight) -C<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD> -section of an HTML document. 
The parse() method -will return a FALSE value as soon as some E<lt>BODY> element or body -text are found, and should not be called again after this. - -Note that the C<HTML::HeadParser> might get confused if raw undecoded -UTF-8 is passed to the parse() method. Make sure the strings are -properly decoded before passing them on. - -The C<HTML::HeadParser> keeps a reference to a header object, and the -parser will update this header object as the various elements of the -E<lt>HEAD> section of the HTML document are recognized. The following -header fields are affected: - -=over 4 - -=item Content-Base: - -The I<Content-Base> header is initialized from the E<lt>base -href="..."> element. - -=item Title: - -The I<Title> header is initialized from the E<lt>title>...E<lt>/title> -element. - -=item Isindex: - -The I<Isindex> header will be added if there is a E<lt>isindex> -element in the E<lt>head>. The header value is initialized from the -I<prompt> attribute if it is present. If no I<prompt> attribute is -given it will have '?' as the value. - -=item X-Meta-Foo: - -All E<lt>meta> elements will initialize headers with the prefix -"C<X-Meta->" on the name. If the E<lt>meta> element contains a -C<http-equiv> attribute, then it will be honored as the header name. - -=back - -=head1 METHODS - -The following methods (in addition to those provided by the -superclass) are available: - -=over 4 - -=cut - - -require HTML::Parser; -@ISA = qw(HTML::Parser); - -use HTML::Entities (); - -use strict; -use vars qw($VERSION $DEBUG); -#$DEBUG = 1; -$VERSION = sprintf("%d.%02d", q$Revision: 2.22 $ =~ /(\d+)\.(\d+)/); - -=item $hp = HTML::HeadParser->new - -=item $hp = HTML::HeadParser->new( $header ) - -The object constructor. The optional $header argument should be a -reference to an object that implement the header() and push_header() -methods as defined by the C<HTTP::Headers> class. Normally it will be -of some class that isa or delegates to the C<HTTP::Headers> class. 
- -If no $header is given C<HTML::HeadParser> will create an -C<HTTP::Header> object by itself (initially empty). - -=cut - -sub new -{ - my($class, $header) = @_; - unless ($header) { - require HTTP::Headers; - $header = HTTP::Headers->new; - } - - my $self = $class->SUPER::new(api_version => 2, - ignore_elements => [qw(script style)], - ); - $self->{'header'} = $header; - $self->{'tag'} = ''; # name of active element that takes textual content - $self->{'text'} = ''; # the accumulated text associated with the element - $self; -} - -=item $hp->header; - -Returns a reference to the header object. - -=item $hp->header( $key ) - -Returns a header value. It is just a shorter way to write -C<$hp-E<gt>header-E<gt>header($key)>. - -=cut - -sub header -{ - my $self = shift; - return $self->{'header'} unless @_; - $self->{'header'}->header(@_); -} - -sub as_string # legacy -{ - my $self = shift; - $self->{'header'}->as_string; -} - -sub flush_text # internal -{ - my $self = shift; - my $tag = $self->{'tag'}; - my $text = $self->{'text'}; - $text =~ s/^\s+//; - $text =~ s/\s+$//; - $text =~ s/\s+/ /g; - print "FLUSH $tag => '$text'\n" if $DEBUG; - if ($tag eq 'title') { - HTML::Entities::decode($text); - $self->{'header'}->push_header(Title => $text); - } - $self->{'tag'} = $self->{'text'} = ''; -} - -# This is an quote from the HTML3.2 DTD which shows which elements -# that might be present in a <HEAD>...</HEAD>. Also note that the -# <HEAD> tags themselves might be missing: -# -# <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? 
& -# SCRIPT* & META* & LINK*"> -# -# <!ELEMENT HEAD O O (%head.content)> - - -sub start -{ - my($self, $tag, $attr) = @_; # $attr is reference to a HASH - print "START[$tag]\n" if $DEBUG; - $self->flush_text if $self->{'tag'}; - if ($tag eq 'meta') { - my $key = $attr->{'http-equiv'}; - if (!defined($key) || !length($key)) { - return unless $attr->{'name'}; - $key = "X-Meta-\u$attr->{'name'}"; - } - $self->{'header'}->push_header($key => $attr->{content}); - } elsif ($tag eq 'base') { - return unless exists $attr->{href}; - $self->{'header'}->push_header('Content-Base' => $attr->{href}); - } elsif ($tag eq 'isindex') { - # This is a non-standard header. Perhaps we should just ignore - # this element - $self->{'header'}->push_header(Isindex => $attr->{prompt} || '?'); - } elsif ($tag =~ /^(?:title|script|style)$/) { - # Just remember tag. Initialize header when we see the end tag. - $self->{'tag'} = $tag; - } elsif ($tag eq 'link') { - return unless exists $attr->{href}; - # <link href="http:..." rel="xxx" rev="xxx" title="xxx"> - my $h_val = "<" . delete($attr->{href}) . 
">"; - for (sort keys %{$attr}) { - $h_val .= qq(; $_="$attr->{$_}"); - } - $self->{'header'}->push_header(Link => $h_val); - } elsif ($tag eq 'head' || $tag eq 'html') { - # ignore - } else { - # stop parsing - $self->eof; - } -} - -sub end -{ - my($self, $tag) = @_; - print "END[$tag]\n" if $DEBUG; - $self->flush_text if $self->{'tag'}; - $self->eof if $tag eq 'head'; -} - -sub text -{ - my($self, $text) = @_; - $text =~ s/\x{FEFF}//; # drop Unicode BOM if found - print "TEXT[$text]\n" if $DEBUG; - my $tag = $self->{tag}; - if (!$tag && $text =~ /\S/) { - # Normal text means start of body - $self->eof; - return; - } - return if $tag ne 'title'; - $self->{'text'} .= $text; -} - -1; - -__END__ - -=back - -=head1 EXAMPLE - - $h = HTTP::Headers->new; - $p = HTML::HeadParser->new($h); - $p->parse(<<EOT); - <title>Stupid example</title> - <base href="http://www.linpro.no/lwp/"> - Normal text starts here. - EOT - undef $p; - print $h->title; # should print "Stupid example" - -=head1 SEE ALSO - -L<HTML::Parser>, L<HTTP::Headers> - -The C<HTTP::Headers> class is distributed as part of the -I<libwww-perl> package. If you don't have that distribution installed -you need to provide the $header argument to the C<HTML::HeadParser> -constructor with your own object that implements the documented -protocol. - -=head1 COPYRIGHT - -Copyright 1996-2001 Gisle Aas. All rights reserved. - -This library is free software; you can redistribute it and/or -modify it under the same terms as Perl itself. 
- -=cut - diff --git a/chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML/LinkExtor.pm b/chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML/LinkExtor.pm deleted file mode 100644 index d543a5aba7b..00000000000 --- a/chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML/LinkExtor.pm +++ /dev/null @@ -1,187 +0,0 @@ -package HTML::LinkExtor; - -# $Id: LinkExtor.pm,v 1.33 2003/10/10 10:20:56 gisle Exp $ - -require HTML::Parser; -@ISA = qw(HTML::Parser); -$VERSION = sprintf("%d.%02d", q$Revision: 1.33 $ =~ /(\d+)\.(\d+)/); - -=head1 NAME - -HTML::LinkExtor - Extract links from an HTML document - -=head1 SYNOPSIS - - require HTML::LinkExtor; - $p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/"); - sub cb { - my($tag, %links) = @_; - print "$tag @{[%links]}\n"; - } - $p->parse_file("index.html"); - -=head1 DESCRIPTION - -I<HTML::LinkExtor> is an HTML parser that extracts links from an -HTML document. The I<HTML::LinkExtor> is a subclass of -I<HTML::Parser>. This means that the document should be given to the -parser by calling the $p->parse() or $p->parse_file() methods. - -=cut - -use strict; -use HTML::Tagset (); - -# legacy (some applications grabs this hash directly) -use vars qw(%LINK_ELEMENT); -*LINK_ELEMENT = \%HTML::Tagset::linkElements; - -=over 4 - -=item $p = HTML::LinkExtor->new - -=item $p = HTML::LinkExtor->new( $callback ) - -=item $p = HTML::LinkExtor->new( $callback, $base ) - -The constructor takes two optional arguments. The first is a reference -to a callback routine. It will be called as links are found. If a -callback is not provided, then links are just accumulated internally -and can be retrieved by calling the $p->links() method. - -The $base argument is an optional base URL used to absolutize all URLs found. -You need to have the I<URI> module installed if you provide $base. 
- -The callback is called with the lowercase tag name as first argument, -and then all link attributes as separate key/value pairs. All -non-link attributes are removed. - -=cut - -sub new -{ - my($class, $cb, $base) = @_; - my $self = $class->SUPER::new( - start_h => ["_start_tag", "self,tagname,attr"], - report_tags => [keys %HTML::Tagset::linkElements], - ); - $self->{extractlink_cb} = $cb; - if ($base) { - require URI; - $self->{extractlink_base} = URI->new($base); - } - $self; -} - -sub _start_tag -{ - my($self, $tag, $attr) = @_; - - my $base = $self->{extractlink_base}; - my $links = $HTML::Tagset::linkElements{$tag}; - $links = [$links] unless ref $links; - - my @links; - my $a; - for $a (@$links) { - next unless exists $attr->{$a}; - push(@links, $a, $base ? URI->new($attr->{$a}, $base)->abs($base) - : $attr->{$a}); - } - return unless @links; - $self->_found_link($tag, @links); -} - -sub _found_link -{ - my $self = shift; - my $cb = $self->{extractlink_cb}; - if ($cb) { - &$cb(@_); - } else { - push(@{$self->{'links'}}, [@_]); - } -} - -=item $p->links - -Returns a list of all links found in the document. The returned -values will be anonymous arrays with the follwing elements: - - [$tag, $attr => $url1, $attr2 => $url2,...] - -The $p->links method will also truncate the internal link list. This -means that if the method is called twice without any parsing -between them the second call will return an empty list. - -Also note that $p->links will always be empty if a callback routine -was provided when the I<HTML::LinkExtor> was created. - -=cut - -sub links -{ - my $self = shift; - exists($self->{'links'}) ? @{delete $self->{'links'}} : (); -} - -# We override the parse_file() method so that we can clear the links -# before we start a new file. 
-sub parse_file -{ - my $self = shift; - delete $self->{'links'}; - $self->SUPER::parse_file(@_); -} - -=back - -=head1 EXAMPLE - -This is an example showing how you can extract links from a document -received using LWP: - - use LWP::UserAgent; - use HTML::LinkExtor; - use URI::URL; - - $url = "http://www.perl.org/"; # for instance - $ua = LWP::UserAgent->new; - - # Set up a callback that collect image links - my @imgs = (); - sub callback { - my($tag, %attr) = @_; - return if $tag ne 'img'; # we only look closer at <img ...> - push(@imgs, values %attr); - } - - # Make the parser. Unfortunately, we don't know the base yet - # (it might be diffent from $url) - $p = HTML::LinkExtor->new(\&callback); - - # Request document and parse it as it arrives - $res = $ua->request(HTTP::Request->new(GET => $url), - sub {$p->parse($_[0])}); - - # Expand all image URLs to absolute ones - my $base = $res->base; - @imgs = map { $_ = url($_, $base)->abs; } @imgs; - - # Print them out - print join("\n", @imgs), "\n"; - -=head1 SEE ALSO - -L<HTML::Parser>, L<HTML::Tagset>, L<LWP>, L<URI::URL> - -=head1 COPYRIGHT - -Copyright 1996-2001 Gisle Aas. - -This library is free software; you can redistribute it and/or -modify it under the same terms as Perl itself. - -=cut - -1; diff --git a/chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML/Parser.pm b/chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML/Parser.pm deleted file mode 100644 index 72d5a9841fa..00000000000 --- a/chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML/Parser.pm +++ /dev/null @@ -1,1233 +0,0 @@ -package HTML::Parser; - -# Copyright 1996-2007, Gisle Aas. -# Copyright 1999-2000, Michael A. Chase. -# -# This library is free software; you can redistribute it and/or -# modify it under the same terms as Perl itself. 
- -use strict; -use vars qw($VERSION @ISA); - -$VERSION = '3.56'; # $Date: 2007/01/12 09:18:31 $ - -require HTML::Entities; - -require XSLoader; -XSLoader::load('HTML::Parser', $VERSION); - -sub new -{ - my $class = shift; - my $self = bless {}, $class; - return $self->init(@_); -} - - -sub init -{ - my $self = shift; - $self->_alloc_pstate; - - my %arg = @_; - my $api_version = delete $arg{api_version} || (@_ ? 3 : 2); - if ($api_version >= 4) { - require Carp; - Carp::croak("API version $api_version not supported " . - "by HTML::Parser $VERSION"); - } - - if ($api_version < 3) { - # Set up method callbacks compatible with HTML-Parser-2.xx - $self->handler(text => "text", "self,text,is_cdata"); - $self->handler(end => "end", "self,tagname,text"); - $self->handler(process => "process", "self,token0,text"); - $self->handler(start => "start", - "self,tagname,attr,attrseq,text"); - - $self->handler(comment => - sub { - my($self, $tokens) = @_; - for (@$tokens) { - $self->comment($_); - } - }, "self,tokens"); - - $self->handler(declaration => - sub { - my $self = shift; - $self->declaration(substr($_[0], 2, -1)); - }, "self,text"); - } - - if (my $h = delete $arg{handlers}) { - $h = {@$h} if ref($h) eq "ARRAY"; - while (my($event, $cb) = each %$h) { - $self->handler($event => @$cb); - } - } - - # In the end we try to assume plain attribute or handler - while (my($option, $val) = each %arg) { - if ($option =~ /^(\w+)_h$/) { - $self->handler($1 => @$val); - } - elsif ($option =~ /^(text|start|end|process|declaration|comment)$/) { - require Carp; - Carp::croak("Bad constructor option '$option'"); - } - else { - $self->$option($val); - } - } - - return $self; -} - - -sub parse_file -{ - my($self, $file) = @_; - my $opened; - if (!ref($file) && ref(\$file) ne "GLOB") { - # Assume $file is a filename - local(*F); - open(F, $file) || return undef; - binmode(F); # should we? 
good for byte counts - $opened++; - $file = *F; - } - my $chunk = ''; - while (read($file, $chunk, 512)) { - $self->parse($chunk) || last; - } - close($file) if $opened; - $self->eof; -} - - -sub netscape_buggy_comment # legacy -{ - my $self = shift; - require Carp; - Carp::carp("netscape_buggy_comment() is deprecated. " . - "Please use the strict_comment() method instead"); - my $old = !$self->strict_comment; - $self->strict_comment(!shift) if @_; - return $old; -} - -# set up method stubs -sub text { } -*start = \&text; -*end = \&text; -*comment = \&text; -*declaration = \&text; -*process = \&text; - -1; - -__END__ - - -=head1 NAME - -HTML::Parser - HTML parser class - -=head1 SYNOPSIS - - use HTML::Parser (); - - # Create parser object - $p = HTML::Parser->new( api_version => 3, - start_h => [\&start, "tagname, attr"], - end_h => [\&end, "tagname"], - marked_sections => 1, - ); - - # Parse document text chunk by chunk - $p->parse($chunk1); - $p->parse($chunk2); - #... - $p->eof; # signal end of document - - # Parse directly from file - $p->parse_file("foo.html"); - # or - open(my $fh, "<:utf8", "foo.html") || die; - $p->parse_file($fh); - -=head1 DESCRIPTION - -Objects of the C<HTML::Parser> class will recognize markup and -separate it from plain text (alias data content) in HTML -documents. As different kinds of markup and text are recognized, the -corresponding event handlers are invoked. - -C<HTML::Parser> is not a generic SGML parser. We have tried to -make it able to deal with the HTML that is actually "out there", and -it normally parses as closely as possible to the way the popular web -browsers do it instead of strictly following one of the many HTML -specifications from W3C. Where there is disagreement, there is often -an option that you can enable to get the official behaviour. - -The document to be parsed may be supplied in arbitrary chunks. This -makes on-the-fly parsing as documents are received from the network -possible. 
- -If event driven parsing does not feel right for your application, you -might want to use C<HTML::PullParser>. This is an C<HTML::Parser> -subclass that allows a more conventional program structure. - - -=head1 METHODS - -The following method is used to construct a new C<HTML::Parser> object: - -=over - -=item $p = HTML::Parser->new( %options_and_handlers ) - -This class method creates a new C<HTML::Parser> object and -returns it. Key/value argument pairs may be provided to assign event -handlers or initialize parser options. The handlers and parser -options can also be set or modified later by the method calls described below. - -If a top level key is in the form "<event>_h" (e.g., "text_h") then it -assigns a handler to that event, otherwise it initializes a parser -option. The event handler specification value must be an array -reference. Multiple handlers may also be assigned with the 'handlers -=> [%handlers]' option. See examples below. - -If new() is called without any arguments, it will create a parser that -uses callback methods compatible with version 2 of C<HTML::Parser>. -See the section on "version 2 compatibility" below for details. - -The special constructor option 'api_version => 2' can be used to -initialize version 2 callbacks while still setting other options and -handlers. The 'api_version => 3' option can be used if you don't want -to set any options and don't want to fall back to v2 compatible -mode. - -Examples: - - $p = HTML::Parser->new(api_version => 3, - text_h => [ sub {...}, "dtext" ]); - -This creates a new parser object with a text event handler subroutine -that receives the original text with general entities decoded. - - $p = HTML::Parser->new(api_version => 3, - start_h => [ 'my_start', "self,tokens" ]); - -This creates a new parser object with a start event handler method -that receives the $p and the tokens array. 
- - $p = HTML::Parser->new(api_version => 3, - handlers => { text => [\@array, "event,text"], - comment => [\@array, "event,text"], - }); - -This creates a new parser object that stores the event type and the -original text in @array for text and comment events. - -=back - -The following methods feed the HTML document -to the C<HTML::Parser> object: - -=over - -=item $p->parse( $string ) - -Parse $string as the next chunk of the HTML document. The return -value is normally a reference to the parser object (i.e. $p). -Handlers invoked should not attempt to modify the $string in-place until -$p->parse returns. - -If an invoked event handler aborts parsing by calling $p->eof, then -$p->parse() will return a FALSE value. - -=item $p->parse( $code_ref ) - -If a code reference is passed as the argument to be parsed, then the -chunks to be parsed are obtained by invoking this function repeatedly. -Parsing continues until the function returns an empty (or undefined) -result. When this happens $p->eof is automatically signaled. - -Parsing will also abort if one of the event handlers calls $p->eof. - -The effect of this is the same as: - - while (1) { - my $chunk = &$code_ref(); - if (!defined($chunk) || !length($chunk)) { - $p->eof; - return $p; - } - $p->parse($chunk) || return undef; - } - -But it is more efficient as this loop runs internally in XS code. - -=item $p->parse_file( $file ) - -Parse text directly from a file. The $file argument can be a -filename, an open file handle, or a reference to an open file -handle. - -If $file contains a filename and the file can't be opened, then the -method returns an undefined value and $! tells why it failed. -Otherwise the return value is a reference to the parser object. - -If a file handle is passed as the $file argument, then the file will -normally be read until EOF, but not closed. - -If an invoked event handler aborts parsing by calling $p->eof, -then $p->parse_file() may not have read the entire file. 
- -On systems with multi-byte line terminators, the values passed for the -offset and length argspecs may be too low if parse_file() is called on -a file handle that is not in binary mode. - -If a filename is passed in, then parse_file() will open the file in -binary mode. - -=item $p->eof - -Signals the end of the HTML document. Calling the $p->eof method -outside a handler callback will flush any remaining buffered text -(which triggers the C<text> event if there is any remaining text). - -Calling $p->eof inside a handler will terminate parsing at that point -and cause $p->parse to return a FALSE value. This also terminates -parsing by $p->parse_file(). - -After $p->eof has been called, the parse() and parse_file() methods -can be invoked to feed new documents with the parser object. - -The return value from eof() is a reference to the parser object. - -=back - - -Most parser options are controlled by boolean attributes. -Each boolean attribute is enabled by calling the corresponding method -with a TRUE argument and disabled with a FALSE argument. The -attribute value is left unchanged if no argument is given. The return -value from each method is the old attribute value. - -Methods that can be used to get and/or set parser options are: - -=over - -=item $p->attr_encoded - -=item $p->attr_encoded( $bool ) - -By default, the C<attr> and C<@attr> argspecs will have general -entities for attribute values decoded. Enabling this attribute leaves -entities alone. - -=item $p->boolean_attribute_value( $val ) - -This method sets the value reported for boolean attributes inside HTML -start tags. By default, the name of the attribute is also used as its -value. This affects the values reported for C<tokens> and C<attr> -argspecs. - -=item $p->case_sensitive - -=item $p->case_sensitive( $bool ) - -By default, tagnames and attribute names are down-cased. Enabling this -attribute leaves them as found in the HTML source document. 
- -=item $p->closing_plaintext - -=item $p->closing_plaintext( $bool ) - -By default, "plaintext" element can never be closed. Everything up to -the end of the document is parsed in CDATA mode. This historical -behaviour is what at least MSIE does. Enabling this attribute makes -closing "</plaintext>" tag effective and the parsing process will resume -after seeing this tag. This emulates gecko-based browsers. - -=item $p->empty_element_tags - -=item $p->empty_element_tags( $bool ) - -By default, empty element tags are not recognized as such and the "/" -before ">" is just treated like a normal name character (unless -C<strict_names> is enabled). Enabling this attribute makes -C<HTML::Parser> recognize these tags. - -Empty element tags look like start tags, but end with the character -sequence "/>" instead of ">". When recognized by C<HTML::Parser> they -cause an artificial end event in addition to the start event. The -C<text> for the artificial end event will be empty and the C<tokenpos> -array will be undefined even though the token array will have one -element containing the tag name. - -=item $p->marked_sections - -=item $p->marked_sections( $bool ) - -By default, section markings like <![CDATA[...]]> are treated like -ordinary text. When this attribute is enabled section markings are -honoured. - -There are currently no events associated with the marked section -markup, but the text can be returned as C<skipped_text>. - -=item $p->strict_comment - -=item $p->strict_comment( $bool ) - -By default, comments are terminated by the first occurrence of "-->". -This is the behaviour of most popular browsers (like Mozilla, Opera and -MSIE), but it is not correct according to the official HTML -standard. Officially, you need an even number of "--" tokens before -the closing ">" is recognized and there may not be anything but -whitespace between an even and an odd "--". - -The official behaviour is enabled by enabling this attribute.
- -Enabling of 'strict_comment' also disables recognizing these forms as -comments: - - </ comment> - <! comment> - - -=item $p->strict_end - -=item $p->strict_end( $bool ) - -By default, attributes and other junk are allowed to be present on end tags in a -manner that emulates MSIE's behaviour. - -The official behaviour is enabled with this attribute. If enabled, -only whitespace is allowed between the tagname and the final ">". - -=item $p->strict_names - -=item $p->strict_names( $bool ) - -By default, almost anything is allowed in tag and attribute names. -This is the behaviour of most popular browsers and allows us to parse -some broken tags with invalid attribute values like: - - <IMG SRC=newprevlstGr.gif ALT=[PREV LIST] BORDER=0> - -By default, "LIST]" is parsed as a boolean attribute, not as -part of the ALT value as was clearly intended. This is also what -Mozilla sees. - -The official behaviour is enabled by enabling this attribute. If -enabled, it will cause the tag above to be reported as text -since "LIST]" is not a legal attribute name. - -=item $p->unbroken_text - -=item $p->unbroken_text( $bool ) - -By default, blocks of text are given to the text handler as soon as -possible (but the parser takes care always to break text at a -boundary between whitespace and non-whitespace so single words and -entities can always be decoded safely). This might create breaks that -make it hard to do transformations on the text. When this attribute is -enabled, blocks of text are always reported in one piece. This will -delay the text event until the following (non-text) event has been -recognized by the parser. - -Note that the C<offset> argspec will give you the offset of the first -segment of text and C<length> is the combined length of the segments. -Since there might be ignored tags in between, these numbers can't be -used to directly index in the original document file. 
- -=item $p->utf8_mode - -=item $p->utf8_mode( $bool ) - -Enable this option when parsing raw undecoded UTF-8. This tells the -parser that the entities expanded for strings reported by C<attr>, -C<@attr> and C<dtext> should be expanded as decoded UTF-8 so they end -up compatible with the surrounding text. - -If C<utf8_mode> is enabled then it is an error to pass strings -containing characters with code above 255 to the parse() method, and -the parse() method will croak if you try. - -Example: The Unicode character "\x{2665}" is "\xE2\x99\xA5" when UTF-8 -encoded. The character can also be represented by the entity -"&hearts;" or "&#x2665;". If we feed the parser: - - $p->parse("\xE2\x99\xA5&hearts;"); - -then C<dtext> will be reported as "\xE2\x99\xA5\x{2665}" without -C<utf8_mode> enabled, but as "\xE2\x99\xA5\xE2\x99\xA5" when enabled. -The latter string is what you want. - -This option is only available with perl-5.8 or better. - -=item $p->xml_mode - -=item $p->xml_mode( $bool ) - -Enabling this attribute changes the parser to allow some XML -constructs. This enables the behaviour controlled individually by -the C<case_sensitive>, C<empty_element_tags>, C<strict_names> and -C<xml_pic> attributes and also suppresses special treatment of -elements that are parsed as CDATA for HTML. - -=item $p->xml_pic - -=item $p->xml_pic( $bool ) - -By default, I<processing instructions> are terminated by ">". When -this attribute is enabled, processing instructions are terminated by -"?>" instead. - -=back - -As markup and text are recognized, handlers are invoked. The following -method is used to set up handlers for different events: - -=over - -=item $p->handler( event => \&subroutine, $argspec ) - -=item $p->handler( event => $method_name, $argspec ) - -=item $p->handler( event => \@accum, $argspec ) - -=item $p->handler( event => "" ); - -=item $p->handler( event => undef ); - -=item $p->handler( event ); - -This method assigns a subroutine, method, or array to handle an event.
- -Event is one of C<text>, C<start>, C<end>, C<declaration>, C<comment>, -C<process>, C<start_document>, C<end_document> or C<default>. - -The C<\&subroutine> is a reference to a subroutine which is called to handle -the event. - -The C<$method_name> is the name of a method of $p which is called to handle -the event. - -The C<@accum> is an array that will hold the event information as -sub-arrays. - -If the second argument is "", the event is ignored. -If it is undef, the default handler is invoked for the event. - -The C<$argspec> is a string that describes the information to be reported -for the event. Any requested information that does not apply to a -specific event is passed as C<undef>. If argspec is omitted, then it -is left unchanged. - -The return value from $p->handler is the old callback routine or a -reference to the accumulator array. - -Any return values from handler callback routines/methods are always -ignored. A handler callback can request parsing to be aborted by -invoking the $p->eof method. A handler callback is not allowed to -invoke the $p->parse() or $p->parse_file() method. An exception will -be raised if it tries. - -Examples: - - $p->handler(start => "start", 'self, attr, attrseq, text' ); - -This causes the "start" method of object $p to be called for 'start' events. -The callback signature is $p->start(\%attr, \@attr_seq, $text). - - $p->handler(start => \&start, 'attr, attrseq, text' ); - -This causes subroutine start() to be called for 'start' events. -The callback signature is start(\%attr, \@attr_seq, $text). - - $p->handler(start => \@accum, '"S", attr, attrseq, text' ); - -This causes 'start' event information to be saved in @accum. -The array elements will be ['S', \%attr, \@attr_seq, $text]. - - $p->handler(start => ""); - -This causes 'start' events to be ignored. It also suppresses -invocations of any default handler for start events. It is in most -cases equivalent to $p->handler(start => sub {}), but is more -efficient. 
It is different from the empty-sub-handler in that -C<skipped_text> is not reset by it. - - $p->handler(start => undef); - -This causes no handler to be associated with start events. -If there is a default handler it will be invoked. - -=back - -Filters based on tags can be set up to limit the number of events -reported. The main bottleneck during parsing is often the huge number -of callbacks made from the parser. Applying filters can improve -performance significantly. - -The following methods control filters: - -=over - -=item $p->ignore_elements( @tags ) - -Both the C<start> event and the C<end> event as well as any events that -would be reported in between are suppressed. The ignored elements can -contain nested occurrences of itself. Example: - - $p->ignore_elements(qw(script style)); - -The C<script> and C<style> tags will always nest properly since their -content is parsed in CDATA mode. For most other tags -C<ignore_elements> must be used with caution since HTML is often not -I<well formed>. - -=item $p->ignore_tags( @tags ) - -Any C<start> and C<end> events involving any of the tags given are -suppressed. To reset the filter (i.e. don't suppress any C<start> and -C<end> events), call C<ignore_tags> without an argument. - -=item $p->report_tags( @tags ) - -Any C<start> and C<end> events involving any of the tags I<not> given -are suppressed. To reset the filter (i.e. report all C<start> and -C<end> events), call C<report_tags> without an argument. - -=back - -Internally, the system has two filter lists, one for C<report_tags> -and one for C<ignore_tags>, and both filters are applied. This -effectively gives C<ignore_tags> precedence over C<report_tags>. - -Examples: - - $p->ignore_tags(qw(style)); - $p->report_tags(qw(script style)); - -results in only C<script> events being reported. - -=head2 Argspec - -Argspec is a string containing a comma-separated list that describes -the information reported by the event. 
The following argspec -identifier names can be used: - -=over - -=item C<attr> - -Attr causes a reference to a hash of attribute name/value pairs to be -passed. - -Boolean attributes' values are either the value set by -$p->boolean_attribute_value, or the attribute name if no value has been -set by $p->boolean_attribute_value. - -This passes undef except for C<start> events. - -Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute -names are forced to lower case. - -General entities are decoded in the attribute values and -one layer of matching quotes enclosing the attribute values is removed. - -The Unicode character set is assumed for entity decoding. With Perl -version 5.6 or earlier only the Latin-1 range is supported, and -entities for characters outside the range 0..255 are left unchanged. - -=item C<@attr> - -Basically the same as C<attr>, but keys and values are passed as -individual arguments and the original sequence of the attributes is -kept. The parameters passed will be the same as the @attr calculated -here: - - @attr = map { $_ => $attr->{$_} } @$attrseq; - -assuming $attr and $attrseq here are the hash and array passed as the -result of C<attr> and C<attrseq> argspecs. - -This passes no values for events besides C<start>. - -=item C<attrseq> - -Attrseq causes a reference to an array of attribute names to be -passed. This can be useful if you want to walk the C<attr> hash in -the original sequence. - -This passes undef except for C<start> events. - -Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute -names are forced to lower case. - -=item C<column> - -Column causes the column number of the start of the event to be passed. -The first column on a line is 0. - -=item C<dtext> - -Dtext causes the decoded text to be passed. General entities are -automatically decoded unless the event was inside a CDATA section or -was between literal start and end tags (C<script>, C<style>, -C<xmp>, and C<plaintext>). 
- -The Unicode character set is assumed for entity decoding. With Perl -version 5.6 or earlier only the Latin-1 range is supported, and -entities for characters outside the range 0..255 are left unchanged. - -This passes undef except for C<text> events. - -=item C<event> - -Event causes the event name to be passed. - -The event name is one of C<text>, C<start>, C<end>, C<declaration>, -C<comment>, C<process>, C<start_document> or C<end_document>. - -=item C<is_cdata> - -Is_cdata causes a TRUE value to be passed if the event is inside a CDATA -section or between literal start and end tags (C<script>, -C<style>, C<xmp>, and C<plaintext>). - -if the flag is FALSE for a text event, then you should normally -either use C<dtext> or decode the entities yourself before the text is -processed further. - -=item C<length> - -Length causes the number of bytes of the source text of the event to -be passed. - -=item C<line> - -Line causes the line number of the start of the event to be passed. -The first line in the document is 1. Line counting doesn't start -until at least one handler requests this value to be reported. - -=item C<offset> - -Offset causes the byte position in the HTML document of the start of -the event to be passed. The first byte in the document has offset 0. - -=item C<offset_end> - -Offset_end causes the byte position in the HTML document of the end of -the event to be passed. This is the same as C<offset> + C<length>. - -=item C<self> - -Self causes the current object to be passed to the handler. If the -handler is a method, this must be the first element in the argspec. - -An alternative to passing self as an argspec is to register closures -that capture $self by themselves as handlers. Unfortunately this -creates circular references which prevent the HTML::Parser object -from being garbage collected. Using the C<self> argspec avoids this -problem. 
- -=item C<skipped_text> - -Skipped_text returns the concatenated text of all the events that have -been skipped since the last time an event was reported. Events might -be skipped because no handler is registered for them or because some -filter applies. Skipped text also includes marked section markup, -since there are no events that can catch it. - -If an C<"">-handler is registered for an event, then the text for this -event is not included in C<skipped_text>. Skipped text both before -and after the C<"">-event is included in the next reported -C<skipped_text>. - -=item C<tag> - -Same as C<tagname>, but prefixed with "/" if it belongs to an C<end> -event and "!" for a declaration. The C<tag> does not have any prefix -for C<start> events, and is in this case identical to C<tagname>. - -=item C<tagname> - -This is the element name (or I<generic identifier> in SGML jargon) for -start and end tags. Since HTML is case insensitive, this name is -forced to lower case to ease string matching. - -Since XML is case sensitive, the tagname case is not changed when -C<xml_mode> is enabled. The same happens if the C<case_sensitive> attribute -is set. - -The declaration type of declaration elements is also passed as a tagname, -even if that is a bit strange. -In fact, in the current implementation tagname is -identical to C<token0> except that the name may be forced to lower case. - -=item C<token0> - -Token0 causes the original text of the first token string to be -passed. This should always be the same as $tokens->[0]. - -For C<declaration> events, this is the declaration type. - -For C<start> and C<end> events, this is the tag name. - -For C<process> and non-strict C<comment> events, this is everything -inside the tag. - -This passes undef if there are no tokens in the event. - -=item C<tokenpos> - -Tokenpos causes a reference to an array of token positions to be -passed. For each string that appears in C<tokens>, this array -contains two numbers. 
The first number is the offset of the start of -the token in the original C<text> and the second number is the length -of the token. - -Boolean attributes in a C<start> event will have (0,0) for the -attribute value offset and length. - -This passes undef if there are no tokens in the event (e.g., C<text>) -and for artificial C<end> events triggered by empty element tags. - -If you are using these offsets and lengths to modify C<text>, you -should either work from right to left, or be very careful to calculate -the changes to the offsets. - -=item C<tokens> - -Tokens causes a reference to an array of token strings to be passed. -The strings are exactly as they were found in the original text, -no decoding or case changes are applied. - -For C<declaration> events, the array contains each word, comment, and -delimited string starting with the declaration type. - -For C<comment> events, this contains each sub-comment. If -$p->strict_comment is disabled, there will be only one sub-comment. - -For C<start> events, this contains the original tag name followed by -the attribute name/value pairs. The values of boolean attributes will -be either the value set by $p->boolean_attribute_value, or the -attribute name if no value has been set by -$p->boolean_attribute_value. - -For C<end> events, this contains the original tag name (always one token). - -For C<process> events, this contains the process instructions (always one -token). - -This passes C<undef> for C<text> events. - -=item C<text> - -Text causes the source text (including markup element delimiters) to be -passed. - -=item C<undef> - -Pass an undefined value. Useful as padding where the same handler -routine is registered for multiple events. - -=item C<'...'> - -A literal string of 0 to 255 characters enclosed -in single (') or double (") quotes is passed as entered. - -=back - -The whole argspec string can be wrapped up in C<'@{...}'> to signal -that the resulting event array should be flattened.
This only makes a -difference if an array reference is used as the handler target. -Consider this example: - - $p->handler(text => [], 'text'); - $p->handler(text => [], '@{text}'); - -With two text events; C<"foo">, C<"bar">; then the first example will end -up with [["foo"], ["bar"]] and the second with ["foo", "bar"] in -the handler target array. - - -=head2 Events - -Handlers for the following events can be registered: - -=over - -=item C<comment> - -This event is triggered when a markup comment is recognized. - -Example: - - <!-- This is a comment -- -- So is this --> - -=item C<declaration> - -This event is triggered when a I<markup declaration> is recognized. - -For typical HTML documents, the only declaration you are -likely to find is <!DOCTYPE ...>. - -Example: - - <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" - "http://www.w3.org/TR/html40/strict.dtd"> - -DTDs inside <!DOCTYPE ...> will confuse HTML::Parser. - -=item C<default> - -This event is triggered for events that do not have a specific -handler. You can set up a handler for this event to catch stuff you -did not want to catch explicitly. - -=item C<end> - -This event is triggered when an end tag is recognized. - -Example: - - </A> - -=item C<end_document> - -This event is triggered when $p->eof is called and after any remaining -text is flushed. There is no document text associated with this event. - -=item C<process> - -This event is triggered when processing instruction markup is -recognized. - -The format and content of processing instructions are system and -application dependent. - -Examples: - - <? HTML processing instructions > - <? XML processing instructions ?> - -=item C<start> - -This event is triggered when a start tag is recognized. - -Example: - - <A HREF="http://www.perl.com/"> - -=item C<start_document> - -This event is triggered before any other events for a new document. A -handler for it can be used to initialize stuff.
There is no document -text associated with this event. - -=item C<text> - -This event is triggered when plain text (characters) is recognized. -The text may contain multiple lines. A sequence of text may be broken -between several text events unless $p->unbroken_text is enabled. - -The parser will make sure that it does not break a word or a sequence -of whitespace between two text events. - -=back - -=head2 Unicode - -The C<HTML::Parser> can parse Unicode strings when running under -perl-5.8 or better. If Unicode is passed to $p->parse() then chunks -of Unicode will be reported to the handlers. The offset and length -argspecs will also report their position in terms of characters. - -It is safe to parse raw undecoded UTF-8 if you either avoid decoding -entities and make sure to not use I<argspecs> that do, or enable the -C<utf8_mode> for the parser. Parsing of undecoded UTF-8 might be -useful when parsing from a file where you need the reported offsets -and lengths to match the byte offsets in the file. - -If a filename is passed to $p->parse_file() then the file will be read -in binary mode. This will be fine if the file contains only ASCII or -Latin-1 characters. If the file contains UTF-8 encoded text then care -must be taken when decoding entities as described in the previous -paragraph, but better is to open the file with the UTF-8 layer so that -it is decoded properly: - - open(my $fh, "<:utf8", "index.html") || die "...: $!"; - $p->parse_file($fh); - -If the file contains text encoded in a charset besides ASCII, Latin-1 -or UTF-8 then decoding will always be needed. - -=head1 VERSION 2 COMPATIBILITY - -When an C<HTML::Parser> object is constructed with no arguments, a set -of handlers is automatically provided that is compatible with the old -HTML::Parser version 2 callback methods. 
- -This is equivalent to the following method calls: - - $p->handler(start => "start", "self, tagname, attr, attrseq, text"); - $p->handler(end => "end", "self, tagname, text"); - $p->handler(text => "text", "self, text, is_cdata"); - $p->handler(process => "process", "self, token0, text"); - $p->handler(comment => - sub { - my($self, $tokens) = @_; - for (@$tokens) {$self->comment($_);}}, - "self, tokens"); - $p->handler(declaration => - sub { - my $self = shift; - $self->declaration(substr($_[0], 2, -1));}, - "self, text"); - -Setting up these handlers can also be requested with the "api_version => -2" constructor option. - -=head1 SUBCLASSING - -The C<HTML::Parser> class is subclassable. Parser objects are plain -hashes and C<HTML::Parser> reserves only hash keys that start with -"_hparser". The parser state can be set up by invoking the init() -method, which takes the same arguments as new(). - -=head1 EXAMPLES - -The first simple example shows how you might strip out comments from -an HTML document. We achieve this by setting up a comment handler that -does nothing and a default handler that will print out anything else: - - use HTML::Parser; - HTML::Parser->new(default_h => [sub { print shift }, 'text'], - comment_h => [""], - )->parse_file(shift || die) || die $!; - -An alternative implementation is: - - use HTML::Parser; - HTML::Parser->new(end_document_h => [sub { print shift }, - 'skipped_text'], - comment_h => [""], - )->parse_file(shift || die) || die $!; - -This will in most cases be much more efficient since only a single -callback will be made. - -The next example prints out the text that is inside the <title> -element of an HTML document. Here we start by setting up a start -handler. 
When it sees the title start tag it enables a text handler -that prints any text found and an end handler that will terminate -parsing as soon as the title end tag is seen: - - use HTML::Parser (); - - sub start_handler - { - return if shift ne "title"; - my $self = shift; - $self->handler(text => sub { print shift }, "dtext"); - $self->handler(end => sub { shift->eof if shift eq "title"; }, - "tagname,self"); - } - - my $p = HTML::Parser->new(api_version => 3); - $p->handler( start => \&start_handler, "tagname,self"); - $p->parse_file(shift || die) || die $!; - print "\n"; - -More examples are found in the F<eg/> directory of the C<HTML-Parser> -distribution: the program C<hrefsub> shows how you can edit all links -found in a document; the program C<htextsub> shows how to edit the text only; the -program C<hstrip> shows how you can strip out certain tags/elements -and/or attributes; and the program C<htext> show how to obtain the -plain text, but not any script/style content. - -You can browse the F<eg/> directory online from the I<[Browse]> link on -the http://search.cpan.org/~gaas/HTML-Parser/ page. - -=head1 BUGS - -The <style> and <script> sections do not end with the first "</", but -need the complete corresponding end tag. The standard behaviour is -not really practical. - -When the I<strict_comment> option is enabled, we still recognize -comments where there is something other than whitespace between even -and odd "--" markers. - -Once $p->boolean_attribute_value has been set, there is no way to -restore the default behaviour. - -There is currently no way to get both quote characters -into the same literal argspec. - -Empty tags, e.g. "<>" and "</>", are not recognized. SGML allows them -to repeat the previous start tag or close the previous start tag -respectively. - -NET tags, e.g. "code/.../" are not recognized. This is SGML -shorthand for "<code>...</code>". - -Unclosed start or end tags, e.g. "<tt<b>...</b</tt>" are not -recognized. 
- -=head1 DIAGNOSTICS - -The following messages may be produced by HTML::Parser. The notation -in this listing is the same as used in L<perldiag>: - -=over - -=item Not a reference to a hash - -(F) The object blessed into or subclassed from HTML::Parser is not a -hash as required by the HTML::Parser methods. - -=item Bad signature in parser state object at %p - -(F) The _hparser_xs_state element does not refer to a valid state structure. -Something must have changed the internal value -stored in this hash element, or the memory has been overwritten. - -=item _hparser_xs_state element is not a reference - -(F) The _hparser_xs_state element has been destroyed. - -=item Can't find '_hparser_xs_state' element in HTML::Parser hash - -(F) The _hparser_xs_state element is missing from the parser hash. -It was either deleted, or not created when the object was created. - -=item API version %s not supported by HTML::Parser %s - -(F) The constructor option 'api_version' with an argument greater than -or equal to 4 is reserved for future extensions. - -=item Bad constructor option '%s' - -(F) An unknown constructor option key was passed to the new() or -init() methods. - -=item Parse loop not allowed - -(F) A handler invoked the parse() or parse_file() method. -This is not permitted. - -=item marked sections not supported - -(F) The $p->marked_sections() method was invoked in a HTML::Parser -module that was compiled without support for marked sections. - -=item Unknown boolean attribute (%d) - -(F) Something is wrong with the internal logic that set up aliases for -boolean attributes. - -=item Only code or array references allowed as handler - -(F) The second argument for $p->handler must be either a subroutine -reference, the name of a subroutine or method, or a reference to an -array. - -=item No handler for %s events - -(F) The first argument to $p->handler must be a valid event name; i.e. one -of "start", "end", "text", "process", "declaration" or "comment". 
- -=item Unrecognized identifier %s in argspec - -(F) The identifier is not a known argspec name. -Use one of the names mentioned in the argspec section above. - -=item Literal string is longer than 255 chars in argspec - -(F) The current implementation limits the length of literals in -an argspec to 255 characters. Make the literal shorter. - -=item Backslash reserved for literal string in argspec - -(F) The backslash character "\" is not allowed in argspec literals. -It is reserved to permit quoting inside a literal in a later version. - -=item Unterminated literal string in argspec - -(F) The terminating quote character for a literal was not found. - -=item Bad argspec (%s) - -(F) Only identifier names, literals, spaces and commas -are allowed in argspecs. - -=item Missing comma separator in argspec - -(F) Identifiers in an argspec must be separated with ",". - -=item Parsing of undecoded UTF-8 will give garbage when decoding entities - -(W) The first chunk parsed appears to contain undecoded UTF-8 and one -or more argspecs that decode entities are used for the callback -handlers. - -The result of decoding will be a mix of encoded and decoded characters -for any entities that expand to characters with code above 127. This -is not a good thing. - -The solution is to use the Encode::encode_utf8() on the data before -feeding it to the $p->parse(). For $p->parse_file() pass a file that -has been opened in ":utf8" mode. - -The parser can process raw undecoded UTF-8 sanely if the C<utf8_mode> -is enabled or if the "attr", "@attr" or "dtext" argspecs are avoided. - -=item Parsing string decoded with wrong endianess - -(W) The first character in the document is U+FFFE. This is not a -legal Unicode character but a byte swapped BOM. The result of parsing -will likely be garbage. - -=item Parsing of undecoded UTF-32 - -(W) The parser found the Unicode UTF-32 BOM signature at the start -of the document. The result of parsing will likely be garbage. 
- -=item Parsing of undecoded UTF-16 - -(W) The parser found the Unicode UTF-16 BOM signature at the start of -the document. The result of parsing will likely be garbage. - -=back - -=head1 SEE ALSO - -L<HTML::Entities>, L<HTML::PullParser>, L<HTML::TokeParser>, L<HTML::HeadParser>, -L<HTML::LinkExtor>, L<HTML::Form> - -L<HTML::TreeBuilder> (part of the I<HTML-Tree> distribution) - -http://www.w3.org/TR/html4 - -More information about marked sections and processing instructions may -be found at C<http://www.sgml.u-net.com/book/sgml-8.htm>. - -=head1 COPYRIGHT - - Copyright 1996-2007 Gisle Aas. All rights reserved. - Copyright 1999-2000 Michael A. Chase. All rights reserved. - -This library is free software; you can redistribute it and/or -modify it under the same terms as Perl itself. - -=cut diff --git a/chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML/PullParser.pm b/chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML/PullParser.pm deleted file mode 100644 index e851fe001d4..00000000000 --- a/chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML/PullParser.pm +++ /dev/null @@ -1,211 +0,0 @@ -package HTML::PullParser; - -# $Id: PullParser.pm,v 2.9 2006/04/26 08:00:28 gisle Exp $ - -require HTML::Parser; -@ISA=qw(HTML::Parser); -$VERSION = sprintf("%d.%02d", q$Revision: 2.9 $ =~ /(\d+)\.(\d+)/); - -use strict; -use Carp (); - -sub new -{ - my($class, %cnf) = @_; - - # Construct argspecs for the various events - my %argspec; - for (qw(start end text declaration comment process default)) { - my $tmp = delete $cnf{$_}; - next unless defined $tmp; - $argspec{$_} = $tmp; - } - Carp::croak("Info not collected for any events") - unless %argspec; - - my $file = delete $cnf{file}; - my $doc = delete $cnf{doc}; - Carp::croak("Can't parse from both 'doc' and 'file' at the same time") - if defined($file) && defined($doc); - Carp::croak("No 'doc' or 'file' given to parse from") - unless defined($file) || 
defined($doc); - - # Create object - $cnf{api_version} = 3; - my $self = $class->SUPER::new(%cnf); - - my $accum = $self->{pullparser_accum} = []; - while (my($event, $argspec) = each %argspec) { - $self->SUPER::handler($event => $accum, $argspec); - } - - if (defined $doc) { - $self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc; - $self->{pullparser_str_pos} = 0; - } - else { - if (!ref($file) && ref(\$file) ne "GLOB") { - require IO::File; - $file = IO::File->new($file, "r") || return; - } - - $self->{pullparser_file} = $file; - } - $self; -} - - -sub handler -{ - Carp::croak("Can't set handlers for HTML::PullParser"); -} - - -sub get_token -{ - my $self = shift; - while (!@{$self->{pullparser_accum}} && !$self->{pullparser_eof}) { - if (my $f = $self->{pullparser_file}) { - # must try to parse more from the file - my $buf; - if (read($f, $buf, 512)) { - $self->parse($buf); - } else { - $self->eof; - $self->{pullparser_eof}++; - delete $self->{pullparser_file}; - } - } - elsif (my $sref = $self->{pullparser_str_ref}) { - # must try to parse more from the scalar - my $pos = $self->{pullparser_str_pos}; - my $chunk = substr($$sref, $pos, 512); - $self->parse($chunk); - $pos += length($chunk); - if ($pos < length($$sref)) { - $self->{pullparser_str_pos} = $pos; - } - else { - $self->eof; - $self->{pullparser_eof}++; - delete $self->{pullparser_str_ref}; - delete $self->{pullparser_str_pos}; - } - } - else { - die; - } - } - shift @{$self->{pullparser_accum}}; -} - - -sub unget_token -{ - my $self = shift; - unshift @{$self->{pullparser_accum}}, @_; - $self; -} - -1; - - -__END__ - -=head1 NAME - -HTML::PullParser - Alternative HTML::Parser interface - -=head1 SYNOPSIS - - use HTML::PullParser; - - $p = HTML::PullParser->new(file => "index.html", - start => 'event, tagname, @attr', - end => 'event, tagname', - ignore_elements => [qw(script style)], - ) || die "Can't open: $!"; - while (my $token = $p->get_token) { - #...do something with $token - } - -=head1 
DESCRIPTION - -The HTML::PullParser is an alternative interface to the HTML::Parser class. -It basically turns the HTML::Parser inside out. You associate a file -(or any IO::Handle object or string) with the parser at construction time and -then repeatedly call $parser->get_token to obtain the tags and text -found in the parsed document. - -The following methods are provided: - -=over 4 - -=item $p = HTML::PullParser->new( file => $file, %options ) - -=item $p = HTML::PullParser->new( doc => \$doc, %options ) - -A C<HTML::PullParser> can be made to parse from either a file or a -literal document based on whether the C<file> or C<doc> option is -passed to the parser's constructor. - -The C<file> passed in can either be a file name or a file handle -object. If a file name is passed, and it can't be opened for reading, -then the constructor will return an undefined value and $! will tell -you why it failed. Otherwise the argument is taken to be some object -that the C<HTML::PullParser> can read() from when it needs more data. -The stream will be read() until EOF, but not closed. - -A C<doc> can be passed plain or as a reference -to a scalar. If a reference is passed then the value of this scalar -should not be changed before all tokens have been extracted. - -Next the information to be returned for the different token types must -be set up. This is done by simply associating an argspec (as defined -in L<HTML::Parser>) with the events you have an interest in. For -instance, if you want C<start> tokens to be reported as the string -C<'S'> followed by the tagname and the attributes you might pass a -C<start>-option like this: - - $p = HTML::PullParser->new( - doc => $document_to_parse, - start => '"S", tagname, @attr', - end => '"E", tagname', - ); - -Finally, other C<HTML::Parser> options, like C<ignore_tags>, and -C<unbroken_text>, can be passed in. Note that you should not use the -I<event>_h options to set up parser handlers. 
That would confuse the -inner logic of C<HTML::PullParser>. - -=item $token = $p->get_token - -This method will return the next I<token> found in the HTML document, -or C<undef> at the end of the document. The token is returned as an -array reference. The content of this array match the argspec set up -during C<HTML::PullParser> construction. - -=item $p->unget_token( @tokens ) - -If you find out you have read too many tokens you can push them back, -so that they are returned again the next time $p->get_token is called. - -=back - -=head1 EXAMPLES - -The 'eg/hform' script shows how we might parse the form section of -HTML::Documents using HTML::PullParser. - -=head1 SEE ALSO - -L<HTML::Parser>, L<HTML::TokeParser> - -=head1 COPYRIGHT - -Copyright 1998-2001 Gisle Aas. - -This library is free software; you can redistribute it and/or -modify it under the same terms as Perl itself. - -=cut diff --git a/chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML/TokeParser.pm b/chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML/TokeParser.pm deleted file mode 100644 index a1b8837cb4d..00000000000 --- a/chromium/third_party/cygwin/lib/perl5/vendor_perl/5.10/i686-cygwin/HTML/TokeParser.pm +++ /dev/null @@ -1,371 +0,0 @@ -package HTML::TokeParser; - -# $Id: TokeParser.pm,v 2.37 2006/04/26 08:00:28 gisle Exp $ - -require HTML::PullParser; -@ISA=qw(HTML::PullParser); -$VERSION = sprintf("%d.%02d", q$Revision: 2.37 $ =~ /(\d+)\.(\d+)/); - -use strict; -use Carp (); -use HTML::Entities qw(decode_entities); -use HTML::Tagset (); - -my %ARGS = -( - start => "'S',tagname,attr,attrseq,text", - end => "'E',tagname,text", - text => "'T',text,is_cdata", - process => "'PI',token0,text", - comment => "'C',text", - declaration => "'D',text", - - # options that default on - unbroken_text => 1, -); - - -sub new -{ - my $class = shift; - my %cnf; - if (@_ == 1) { - my $type = (ref($_[0]) eq "SCALAR") ? 
"doc" : "file"; - %cnf = ($type => $_[0]); - } - else { - %cnf = @_; - } - - my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"}; - - my $self = $class->SUPER::new(%cnf, %ARGS) || return undef; - - $self->{textify} = $textify; - $self; -} - - -sub get_tag -{ - my $self = shift; - my $token; - while (1) { - $token = $self->get_token || return undef; - my $type = shift @$token; - next unless $type eq "S" || $type eq "E"; - substr($token->[0], 0, 0) = "/" if $type eq "E"; - return $token unless @_; - for (@_) { - return $token if $token->[0] eq $_; - } - } -} - - -sub _textify { - my($self, $token) = @_; - my $tag = $token->[1]; - return undef unless exists $self->{textify}{$tag}; - - my $alt = $self->{textify}{$tag}; - my $text; - if (ref($alt)) { - $text = &$alt(@$token); - } else { - $text = $token->[2]{$alt || "alt"}; - $text = "[\U$tag]" unless defined $text; - } - return $text; -} - - -sub get_text -{ - my $self = shift; - my @text; - while (my $token = $self->get_token) { - my $type = $token->[0]; - if ($type eq "T") { - my $text = $token->[1]; - decode_entities($text) unless $token->[2]; - push(@text, $text); - } elsif ($type =~ /^[SE]$/) { - my $tag = $token->[1]; - if ($type eq "S") { - if (defined(my $text = _textify($self, $token))) { - push(@text, $text); - next; - } - } else { - $tag = "/$tag"; - } - if (!@_ || grep $_ eq $tag, @_) { - $self->unget_token($token); - last; - } - push(@text, " ") - if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]}; - } - } - join("", @text); -} - - -sub get_trimmed_text -{ - my $self = shift; - my $text = $self->get_text(@_); - $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; - $text; -} - -sub get_phrase { - my $self = shift; - my @text; - while (my $token = $self->get_token) { - my $type = $token->[0]; - if ($type eq "T") { - my $text = $token->[1]; - decode_entities($text) unless $token->[2]; - push(@text, $text); - } elsif ($type =~ /^[SE]$/) { - my $tag = $token->[1]; - if ($type 
eq "S") { - if (defined(my $text = _textify($self, $token))) { - push(@text, $text); - next; - } - } - if (!$HTML::Tagset::isPhraseMarkup{$tag}) { - $self->unget_token($token); - last; - } - push(@text, " ") if $tag eq "br"; - } - } - my $text = join("", @text); - $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; - $text; -} - -1; - - -__END__ - -=head1 NAME - -HTML::TokeParser - Alternative HTML::Parser interface - -=head1 SYNOPSIS - - require HTML::TokeParser; - $p = HTML::TokeParser->new("index.html") || - die "Can't open: $!"; - $p->empty_element_tags(1); # configure its behaviour - - while (my $token = $p->get_token) { - #... - } - -=head1 DESCRIPTION - -The C<HTML::TokeParser> is an alternative interface to the -C<HTML::Parser> class. It is an C<HTML::PullParser> subclass with a -predeclared set of token types. If you wish the tokens to be reported -differently you probably want to use the C<HTML::PullParser> directly. - -The following methods are available: - -=over 4 - -=item $p = HTML::TokeParser->new( $filename, %opt ); - -=item $p = HTML::TokeParser->new( $filehandle, %opt ); - -=item $p = HTML::TokeParser->new( \$document, %opt ); - -The object constructor argument is either a file name, a file handle -object, or the complete document to be parsed. Extra options can be -provided as key/value pairs and are processed as documented by the base -classes. - -If the argument is a plain scalar, then it is taken as the name of a -file to be opened and parsed. If the file can't be opened for -reading, then the constructor will return C<undef> and $! will tell -you why it failed. - -If the argument is a reference to a plain scalar, then this scalar is -taken to be the literal document to parse. The value of this -scalar should not be changed before all tokens have been extracted. - -Otherwise the argument is taken to be some object that the -C<HTML::TokeParser> can read() from when it needs more data. Typically -it will be a filehandle of some kind. 
The stream will be read() until -EOF, but not closed. - -A newly constructed C<HTML::TokeParser> differs from its base classes -by having the C<unbroken_text> attribute enabled by default. See -L<HTML::Parser> for a description of this and other attributes that -influence how the document is parsed. It is often a good idea to enable -C<empty_element_tags> behaviour. - -Note that the parsing result will likely not be valid if raw undecoded -UTF-8 is used as a source. When parsing UTF-8 encoded files turn -on UTF-8 decoding: - - open(my $fh, "<:utf8", "index.html") || die "Can't open 'index.html': $!"; - my $p = HTML::TokeParser->new( $fh ); - # ... - -If a $filename is passed to the constructor the file will be opened in -raw mode and the parsing result will only be valid if its content is -Latin-1 or pure ASCII. - -If parsing from a UTF-8 encoded string buffer decode it first: - - utf8::decode($document); - my $p = HTML::TokeParser->new( \$document ); - # ... - -=item $p->get_token - -This method will return the next I<token> found in the HTML document, -or C<undef> at the end of the document. The token is returned as an -array reference. The first element of the array will be a string -denoting the type of this token: "S" for start tag, "E" for end tag, -"T" for text, "C" for comment, "D" for declaration, and "PI" for -processing instructions. The rest of the token array depends on the type -like this: - - ["S", $tag, $attr, $attrseq, $text] - ["E", $tag, $text] - ["T", $text, $is_data] - ["C", $text] - ["D", $text] - ["PI", $token0, $text] - -where $attr is a hash reference, $attrseq is an array reference and -the rest are plain scalars. The L<HTML::Parser/Argspec> explains the -details. - -=item $p->unget_token( @tokens ) - -If you find you have read too many tokens you can push them back, -so that they are returned the next time $p->get_token is called. 
- -=item $p->get_tag - -=item $p->get_tag( @tags ) - -This method returns the next start or end tag (skipping any other -tokens), or C<undef> if there are no more tags in the document. If -one or more arguments are given, then we skip tokens until one of the -specified tag types is found. For example: - - $p->get_tag("font", "/font"); - -will find the next start or end tag for a font-element. - -The tag information is returned as an array reference in the same form -as for $p->get_token above, but the type code (first element) is -missing. A start tag will be returned like this: - - [$tag, $attr, $attrseq, $text] - -The tagname of an end tag is prefixed with "/", i.e. an end tag is -returned like this: - - ["/$tag", $text] - -=item $p->get_text - -=item $p->get_text( @endtags ) - -This method returns all text found at the current position. It will -return a zero length string if the next token is not text. Any -entities will be converted to their corresponding character. - -If one or more arguments are given, then we return all text occurring -before the first of the specified tags found. For example: - - $p->get_text("p", "br"); - -will return the text up to either a paragraph or linebreak element. - -The text might span tags that should be I<textified>. This is -controlled by the $p->{textify} attribute, which is a hash that -defines how certain tags can be treated as text. If the name of a -start tag matches a key in this hash then this tag is converted to -text. The hash value is used to specify which tag attribute to obtain -the text from. If this tag attribute is missing, then the upper case -name of the tag enclosed in brackets is returned, e.g. "[IMG]". The -hash value can also be a subroutine reference. In this case the -routine is called with the start tag token content as its argument and -the return value is treated as the text. 
- -The default $p->{textify} value is: - - {img => "alt", applet => "alt"} - -This means that <IMG> and <APPLET> tags are treated as text, and that -the text to substitute can be found in the ALT attribute. - -=item $p->get_trimmed_text - -=item $p->get_trimmed_text( @endtags ) - -Same as $p->get_text above, but will collapse any sequences of white -space to a single space character. Leading and trailing white space is -removed. - -=item $p->get_phrase - -This will return all text found at the current position ignoring any -phrasal-level tags. Text is extracted until the first non -phrasal-level tag. Textification of tags is the same as for -get_text(). This method will collapse white space in the same way as -get_trimmed_text() does. - -The definition of I<phrasal-level tags> is obtained from the -HTML::Tagset module. - -=back - -=head1 EXAMPLES - -This example extracts all links from a document. It will print one -line for each link, containing the URL and the textual description -between the <A>...</A> tags: - - use HTML::TokeParser; - $p = HTML::TokeParser->new(shift||"index.html"); - - while (my $token = $p->get_tag("a")) { - my $url = $token->[1]{href} || "-"; - my $text = $p->get_trimmed_text("/a"); - print "$url\t$text\n"; - } - -This example extracts the <TITLE> from the document: - - use HTML::TokeParser; - $p = HTML::TokeParser->new(shift||"index.html"); - if ($p->get_tag("title")) { - my $title = $p->get_trimmed_text; - print "Title: $title\n"; - } - -=head1 SEE ALSO - -L<HTML::PullParser>, L<HTML::Parser> - -=head1 COPYRIGHT - -Copyright 1998-2005 Gisle Aas. - -This library is free software; you can redistribute it and/or -modify it under the same terms as Perl itself. - -=cut |