diff options
Diffstat (limited to 'chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/encoding.pm')
-rw-r--r-- | chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/encoding.pm | 669 |
1 files changed, 0 insertions, 669 deletions
diff --git a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/encoding.pm b/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/encoding.pm deleted file mode 100644 index 77ba4472a23..00000000000 --- a/chromium/third_party/cygwin/lib/perl5/5.10/i686-cygwin/encoding.pm +++ /dev/null @@ -1,669 +0,0 @@ -# $Id: encoding.pm,v 2.7 2008/03/12 09:51:11 dankogai Exp $ -package encoding; -our $VERSION = '2.6_01'; - -use Encode; -use strict; -use warnings; - -sub DEBUG () { 0 } - -BEGIN { - if ( ord("A") == 193 ) { - require Carp; - Carp::croak("encoding: pragma does not support EBCDIC platforms"); - } -} - -our $HAS_PERLIO = 0; -eval { require PerlIO::encoding }; -unless ($@) { - $HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 ); -} - -sub _exception { - my $name = shift; - $] > 5.008 and return 0; # 5.8.1 or higher then no - my %utfs = map { $_ => 1 } - qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE - UTF-32 UTF-32BE UTF-32LE); - $utfs{$name} or return 0; # UTFs or no - require Config; - Config->import(); - our %Config; - return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no -} - -sub in_locale { $^H & ( $locale::hint_bits || 0 ) } - -sub _get_locale_encoding { - my $locale_encoding; - - # I18N::Langinfo isn't available everywhere - eval { - require I18N::Langinfo; - I18N::Langinfo->import(qw(langinfo CODESET)); - $locale_encoding = langinfo( CODESET() ); - }; - - my $country_language; - - no warnings 'uninitialized'; - - if ( (not $locale_encoding) && in_locale() ) { - if ( $ENV{LC_ALL} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) { - ( $country_language, $locale_encoding ) = ( $1, $2 ); - } - elsif ( $ENV{LANG} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) { - ( $country_language, $locale_encoding ) = ( $1, $2 ); - } - - # LANGUAGE affects only LC_MESSAGES only on glibc - } - elsif ( not $locale_encoding ) { - if ( $ENV{LC_ALL} =~ /\butf-?8\b/i - || $ENV{LANG} =~ /\butf-?8\b/i ) - { - $locale_encoding = 'utf8'; - } - - # Could do more heuristics based on the country and language - # parts of LC_ALL and LANG (the parts before the dot (if any)), - # since we have Locale::Country and Locale::Language available. - # TODO: get a database of Language -> Encoding mappings - # (the Estonian database at http://www.eki.ee/letter/ - # would be excellent!) --jhi - } - if ( defined $locale_encoding - && lc($locale_encoding) eq 'euc' - && defined $country_language ) - { - if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) { - $locale_encoding = 'euc-jp'; - } - elsif ( $country_language =~ /^ko_KR|korean?$/i ) { - $locale_encoding = 'euc-kr'; - } - elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)$/i ) { - $locale_encoding = 'euc-cn'; - } - elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) { - $locale_encoding = 'euc-tw'; - } - else { - require Carp; - Carp::croak( - "encoding: Locale encoding '$locale_encoding' too ambiguous" - ); - } - } - - return $locale_encoding; -} - -sub import { - my $class = shift; - my $name = shift; - if ( $name eq ':_get_locale_encoding' ) { # used by lib/open.pm - my $caller = caller(); - { - no strict 'refs'; - *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding; - } - return; - } - $name = _get_locale_encoding() if $name eq ':locale'; - my %arg = @_; - $name = $ENV{PERL_ENCODING} unless defined $name; - my $enc = find_encoding($name); - unless ( defined $enc ) { - require Carp; - Carp::croak("encoding: Unknown encoding '$name'"); - } - $name = $enc->name; # canonize - unless ( $arg{Filter} ) { - DEBUG and warn "_exception($name) = ", _exception($name); - _exception($name) or ${^ENCODING} = $enc; - $HAS_PERLIO or return 1; - } - else { - defined( ${^ENCODING} ) and undef ${^ENCODING}; - - # implicitly 'use utf8' - require utf8; # to fetch $utf8::hint_bits; - $^H |= $utf8::hint_bits; - eval { - require Filter::Util::Call; - Filter::Util::Call->import; - filter_add( - sub { - my $status = filter_read(); - if ( $status > 0 ) { - $_ = $enc->decode( $_, 1 ); - DEBUG and warn $_; - } - $status; - } - ); - }; - $@ eq '' and DEBUG and warn "Filter installed"; - } - defined ${^UNICODE} and ${^UNICODE} != 0 and return 1; - for my $h (qw(STDIN STDOUT)) { - if ( $arg{$h} ) { - unless ( defined find_encoding( $arg{$h} ) ) { - require Carp; - Carp::croak( - "encoding: Unknown encoding for $h, '$arg{$h}'"); - } - eval { binmode( $h, ":raw :encoding($arg{$h})" ) }; - } - else { - unless ( exists $arg{$h} ) { - eval { - no warnings 'uninitialized'; - binmode( $h, ":raw :encoding($name)" ); - }; - } - } - if ($@) { - require Carp; - Carp::croak($@); - } - } - return 1; # I doubt if we need it, though -} - -sub unimport { - no warnings; - undef ${^ENCODING}; - if ($HAS_PERLIO) { - binmode( STDIN, ":raw" ); - binmode( STDOUT, ":raw" ); - } - else { - binmode(STDIN); - binmode(STDOUT); - } - if ( $INC{"Filter/Util/Call.pm"} ) { - eval { filter_del() }; - } -} - -1; -__END__ - -=pod - -=head1 NAME - -encoding - allows you to write your script in non-ascii or non-utf8 - -=head1 SYNOPSIS - - use encoding "greek"; # Perl like Greek to you? - use encoding "euc-jp"; # Jperl! - - # or you can even do this if your shell supports your native encoding - - perl -Mencoding=latin2 -e '...' # Feeling centrally European? - perl -Mencoding=euc-kr -e '...' # Or Korean? - - # more control - - # A simple euc-cn => utf-8 converter - use encoding "euc-cn", STDOUT => "utf8"; while(<>){print}; - - # "no encoding;" supported (but not scoped!) - no encoding; - - # an alternate way, Filter - use encoding "euc-jp", Filter=>1; - # now you can use kanji identifiers -- in euc-jp! - - # switch on locale - - # note that this probably means that unless you have a complete control - # over the environments the application is ever going to be run, you should - # NOT use the feature of encoding pragma allowing you to write your script - # in any recognized encoding because changing locale settings will wreck - # the script; you can of course still use the other features of the pragma. - use encoding ':locale'; - -=head1 ABSTRACT - -Let's start with a bit of history: Perl 5.6.0 introduced Unicode -support. You could apply C<substr()> and regexes even to complex CJK -characters -- so long as the script was written in UTF-8. But back -then, text editors that supported UTF-8 were still rare and many users -instead chose to write scripts in legacy encodings, giving up a whole -new feature of Perl 5.6. - -Rewind to the future: starting from perl 5.8.0 with the B<encoding> -pragma, you can write your script in any encoding you like (so long -as the C<Encode> module supports it) and still enjoy Unicode support. -This pragma achieves that by doing the following: - -=over - -=item * - -Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from -the encoding specified to utf8. In Perl 5.8.1 and later, literals in -C<tr///> and C<DATA> pseudo-filehandle are also converted. - -=item * - -Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding - specified. - -=back - -=head2 Literal Conversions - -You can write code in EUC-JP as follows: - - my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji - #<-char-><-char-> # 4 octets - s/\bCamel\b/$Rakuda/; - -And with C<use encoding "euc-jp"> in effect, it is the same thing as -the code in UTF-8: - - my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters - s/\bCamel\b/$Rakuda/; - -=head2 PerlIO layers for C<STD(IN|OUT)> - -The B<encoding> pragma also modifies the filehandle layers of -STDIN and STDOUT to the specified encoding. Therefore, - - use encoding "euc-jp"; - my $message = "Camel is the symbol of perl.\n"; - my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji - $message =~ s/\bCamel\b/$Rakuda/; - print $message; - -Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", -not "\x{99F1}\x{99DD} is the symbol of perl.\n". - -You can override this by giving extra arguments; see below. - -=head2 Implicit upgrading for byte strings - -By default, if strings operating under byte semantics and strings -with Unicode character data are concatenated, the new string will -be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>. - -The B<encoding> pragma changes this to use the specified encoding -instead. For example: - - use encoding 'utf8'; - my $string = chr(20000); # a Unicode string - utf8::encode($string); # now it's a UTF-8 encoded byte string - # concatenate with another Unicode string - print length($string . chr(20000)); - -Will print C<2>, because C<$string> is upgraded as UTF-8. Without -C<use encoding 'utf8';>, it will print C<4> instead, since C<$string> -is three octets when interpreted as Latin-1. - -=head2 Side effects - -If the C<encoding> pragma is in scope then the lengths returned are -calculated from the length of C<$/> in Unicode characters, which is not -always the same as the length of C<$/> in the native encoding. - -This pragma affects utf8::upgrade, but not utf8::downgrade. - -=head1 FEATURES THAT REQUIRE 5.8.1 - -Some of the features offered by this pragma requires perl 5.8.1. Most -of these are done by Inaba Hiroto. Any other features and changes -are good for 5.8.0. - -=over - -=item "NON-EUC" doublebyte encodings - -Because perl needs to parse script before applying this pragma, such -encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH; -\x5c) in the second byte fails because the second byte may -accidentally escape the quoting character that follows. Perl 5.8.1 -or later fixes this problem. - -=item tr// - -C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0 -See the section below for details. - -=item DATA pseudo-filehandle - -Another feature that was overlooked was C<DATA>. - -=back - -=head1 USAGE - -=over 4 - -=item use encoding [I<ENCNAME>] ; - -Sets the script encoding to I<ENCNAME>. And unless ${^UNICODE} -exists and non-zero, PerlIO layers of STDIN and STDOUT are set to -":encoding(I<ENCNAME>)". - -Note that STDERR WILL NOT be changed. - -Also note that non-STD file handles remain unaffected. Use C<use -open> or C<binmode> to change layers of those. - -If no encoding is specified, the environment variable L<PERL_ENCODING> -is consulted. If no encoding can be found, the error C<Unknown encoding -'I<ENCNAME>'> will be thrown. - -=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ; - -You can also individually set encodings of STDIN and STDOUT via the -C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the -first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding -completely off. - -When ${^UNICODE} exists and non-zero, these options will completely -ignored. ${^UNICODE} is a variable introduced in perl 5.8.1. See -L<perlrun> see L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for -details (perl 5.8.1 and later). - -=item use encoding I<ENCNAME> Filter=E<gt>1; - -This turns the encoding pragma into a source filter. While the -default approach just decodes interpolated literals (in qq() and -qr()), this will apply a source filter to the entire source code. See -L</"The Filter Option"> below for details. - -=item no encoding; - -Unsets the script encoding. The layers of STDIN, STDOUT are -reset to ":raw" (the default unprocessed raw stream of bytes). - -=back - -=head1 The Filter Option - -The magic of C<use encoding> is not applied to the names of -identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human -is a single Han ideograph) work, you still need to write your script -in UTF-8 -- or use a source filter. That's what 'Filter=>1' does. - -What does this mean? Your source code behaves as if it is written in -UTF-8 with 'use utf8' in effect. So even if your editor only supports -Shift_JIS, for example, you can still try examples in Chapter 15 of -C<Programming Perl, 3rd Ed.>. For instance, you can use UTF-8 -identifiers. - -This option is significantly slower and (as of this writing) non-ASCII -identifiers are not very stable WITHOUT this option and with the -source code written in UTF-8. - -=head2 Filter-related changes at Encode version 1.87 - -=over - -=item * - -The Filter option now sets STDIN and STDOUT like non-filter options. -And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like -non-filter version. - -=item * - -C<use utf8> is implicitly declared so you no longer have to C<use -utf8> to C<${"\x{4eba}"}++>. - -=back - -=head1 CAVEATS - -=head2 NOT SCOPED - -The pragma is a per script, not a per block lexical. Only the last -C<use encoding> or C<no encoding> matters, and it affects -B<the whole script>. However, the <no encoding> pragma is supported and -B<use encoding> can appear as many times as you want in a given script. -The multiple use of this pragma is discouraged. - -By the same reason, the use this pragma inside modules is also -discouraged (though not as strongly discouraged as the case above. -See below). - -If you still have to write a module with this pragma, be very careful -of the load order. See the codes below; - - # called module - package Module_IN_BAR; - use encoding "bar"; - # stuff in "bar" encoding here - 1; - - # caller script - use encoding "foo" - use Module_IN_BAR; - # surprise! use encoding "bar" is in effect. - -The best way to avoid this oddity is to use this pragma RIGHT AFTER -other modules are loaded. i.e. - - use Module_IN_BAR; - use encoding "foo"; - -=head2 DO NOT MIX MULTIPLE ENCODINGS - -Notice that only literals (string or regular expression) having only -legacy code points are affected: if you mix data like this - - \xDF\x{100} - -the data is assumed to be in (Latin 1 and) Unicode, not in your native -encoding. In other words, this will match in "greek": - - "\xDF" =~ /\x{3af}/ - -but this will not - - "\xDF\x{100}" =~ /\x{3af}\x{100}/ - -since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on -the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL -LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You -should not be mixing your legacy data and Unicode in the same string. - -This pragma also affects encoding of the 0x80..0xFF code point range: -normally characters in that range are left as eight-bit bytes (unless -they are combined with characters with code points 0x100 or larger, -in which case all characters need to become UTF-8 encoded), but if -the C<encoding> pragma is present, even the 0x80..0xFF range always -gets UTF-8 encoded. - -After all, the best thing about this pragma is that you don't have to -resort to \x{....} just to spell your name in a native encoding. -So feel free to put your strings in your encoding in quotes and -regexes. - -=head2 tr/// with ranges - -The B<encoding> pragma works by decoding string literals in -C<q//,qq//,qr//,qw///, qx//> and so forth. In perl 5.8.0, this -does not apply to C<tr///>. Therefore, - - use encoding 'euc-jp'; - #.... - $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/; - # -------- -------- -------- -------- - -Does not work as - - $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/; - -=over - -=item Legend of characters above - - utf8 euc-jp charnames::viacode() - ----------------------------------------- - \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A - \x{3093} \xA4\xF3 HIRAGANA LETTER N - \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A - \x{30f3} \xA5\xF3 KATAKANA LETTER N - -=back - -This counterintuitive behavior has been fixed in perl 5.8.1. - -=head3 workaround to tr///; - -In perl 5.8.0, you can work around as follows; - - use encoding 'euc-jp'; - # .... - eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ }; - -Note the C<tr//> expression is surrounded by C<qq{}>. The idea behind -is the same as classic idiom that makes C<tr///> 'interpolate'. - - tr/$from/$to/; # wrong! - eval qq{ tr/$from/$to/ }; # workaround. - -Nevertheless, in case of B<encoding> pragma even C<q//> is affected so -C<tr///> not being decoded was obviously against the will of Perl5 -Porters so it has been fixed in Perl 5.8.1 or later. - -=head1 EXAMPLE - Greekperl - - use encoding "iso 8859-7"; - - # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode. - - $a = "\xDF"; - $b = "\x{100}"; - - printf "%#x\n", ord($a); # will print 0x3af, not 0xdf - - $c = $a . $b; - - # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". - - # chr() is affected, and ... - - print "mega\n" if ord(chr(0xdf)) == 0x3af; - - # ... ord() is affected by the encoding pragma ... - - print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; - - # ... as are eq and cmp ... - - print "peta\n" if "\x{3af}" eq pack("C", 0xdf); - print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0; - - # ... but pack/unpack C are not affected, in case you still - # want to go back to your native encoding - - print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; - -=head1 KNOWN PROBLEMS - -=over - -=item literals in regex that are longer than 127 bytes - -For native multibyte encodings (either fixed or variable length), -the current implementation of the regular expressions may introduce -recoding errors for regular expression literals longer than 127 bytes. - -=item EBCDIC - -The encoding pragma is not supported on EBCDIC platforms. -(Porters who are willing and able to remove this limitation are -welcome.) - -=item format - -This pragma doesn't work well with format because PerlIO does not -get along very well with it. When format contains non-ascii -characters it prints funny or gets "wide character warnings". -To understand it, try the code below. - - # Save this one in utf8 - # replace *non-ascii* with a non-ascii string - my $camel; - format STDOUT = - *non-ascii*@>>>>>>> - $camel - . - $camel = "*non-ascii*"; - binmode(STDOUT=>':encoding(utf8)'); # bang! - write; # funny - print $camel, "\n"; # fine - -Without binmode this happens to work but without binmode, print() -fails instead of write(). - -At any rate, the very use of format is questionable when it comes to -unicode characters since you have to consider such things as character -width (i.e. double-width for ideographs) and directions (i.e. BIDI for -Arabic and Hebrew). - -=item Thread safety - -C<use encoding ...> is not thread-safe (i.e., do not use in threaded -applications). - -=back - -=head2 The Logic of :locale - -The logic of C<:locale> is as follows: - -=over 4 - -=item 1. - -If the platform supports the langinfo(CODESET) interface, the codeset -returned is used as the default encoding for the open pragma. - -=item 2. - -If 1. didn't work but we are under the locale pragma, the environment -variables LC_ALL and LANG (in that order) are matched for encodings -(the part after C<.>, if any), and if any found, that is used -as the default encoding for the open pragma. - -=item 3. - -If 1. and 2. didn't work, the environment variables LC_ALL and LANG -(in that order) are matched for anything looking like UTF-8, and if -any found, C<:utf8> is used as the default encoding for the open -pragma. - -=back - -If your locale environment variables (LC_ALL, LC_CTYPE, LANG) -contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), -the default encoding of your STDIN, STDOUT, and STDERR, and of -B<any subsequent file open>, is UTF-8. - -=head1 HISTORY - -This pragma first appeared in Perl 5.8.0. For features that require -5.8.1 and better, see above. - -The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6. - -=head1 SEE ALSO - -L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>, - -Ch. 15 of C<Programming Perl (3rd Edition)> -by Larry Wall, Tom Christiansen, Jon Orwant; -O'Reilly & Associates; ISBN 0-596-00027-8 - -=cut |