#! /usr/bin/perl -w # -*- Perl -*- # # afblue.pl # # Process a blue zone character data file. # # Copyright (C) 2013-2019 by # David Turner, Robert Wilhelm, and Werner Lemberg. # # This file is part of the FreeType project, and may only be used, # modified, and distributed under the terms of the FreeType project # license, LICENSE.TXT. By continuing to use, modify, or distribute # this file you indicate that you have read the license and # understand and accept it fully. use strict; use warnings; use English '-no_match_vars'; use open ':std', ':encoding(UTF-8)'; my $prog = $PROGRAM_NAME; $prog =~ s| .* / ||x; # Remove path. die "usage: $prog datafile < infile > outfile\n" if $#ARGV != 0; my $datafile = $ARGV[0]; my %diversions; # The extracted and massaged data from `datafile'. my @else_stack; # Booleans to track else-clauses. my @name_stack; # Stack of integers used for names of aux. variables. my $curr_enum; # Name of the current enumeration. my $curr_array; # Name of the current array. my $curr_max; # Name of the current maximum value. my $curr_enum_element; # Name of the current enumeration element. my $curr_offset; # The offset relative to current aux. variable. my $curr_elem_size; # The number of non-space characters in the current string or # the number of elements in the current block. my $have_sections = 0; # Boolean; set if start of a section has been seen. my $have_strings; # Boolean; set if current section contains strings. my $have_blocks; # Boolean; set if current section contains blocks. my $have_enum_element; # Boolean; set if we have an enumeration element. my $in_string; # Boolean; set if a string has been parsed. my $num_sections = 0; # Number of sections seen so far. my $last_aux; # Name of last auxiliary variable. # Regular expressions. # [] [] ':' [] '\n' my $section_re = qr/ ^ \s* (\S+) \s+ (\S+) \s+ (\S+) \s* : \s* $ /x; # [] [] '\n' my $enum_element_re = qr/ ^ \s* ( [A-Za-z0-9_]+ ) \s* $ /x; # '#' '\n' my $preprocessor_re = qr/ ^ \# /x; # [] '/' '/' '\n' my $comment_re = qr| ^ \s* // |x; # empty line my $whitespace_only_re = qr/ ^ \s* $ /x; # [] '"' '"' [] '\n' ( doesn't contain newlines) my $string_re = qr/ ^ \s* " ( (?> (?: (?> [^"\\]+ ) | \\. )* ) ) " \s* $ /x; # [] '{' '}' [] '\n' ( can contain newlines) my $block_start_re = qr/ ^ \s* \{ /x; # We need the capturing group for `split' to make it return the separator # tokens (i.e., the opening and closing brace) also. my $brace_re = qr/ ( [{}] ) /x; sub Warn { my $message = shift; warn "$datafile:$INPUT_LINE_NUMBER: warning: $message\n"; } sub Die { my $message = shift; die "$datafile:$INPUT_LINE_NUMBER: error: $message\n"; } my $warned_before = 0; sub warn_before { Warn("data before first section gets ignored") unless $warned_before; $warned_before = 1; } sub strip_newline { chomp; s/ \x0D $ //x; } sub end_curr_string { # Append final null byte to string. if ($have_strings) { push @{$diversions{$curr_array}}, " '\\0',\n" if $in_string; $curr_offset++; $in_string = 0; } } sub update_max_elem_size { if ($curr_elem_size) { my $max = pop @{$diversions{$curr_max}}; $max = $curr_elem_size if $curr_elem_size > $max; push @{$diversions{$curr_max}}, $max; } } sub convert_non_ascii_char { # A UTF-8 character outside of the printable ASCII range, with possibly a # leading backslash character. my $s = shift; # Here we count characters, not bytes. $curr_elem_size += length $s; utf8::encode($s); $s = uc unpack 'H*', $s; $curr_offset += $s =~ s/\G(..)/'\\x$1', /sg; return $s; } sub convert_ascii_chars { # A series of ASCII characters in the printable range. my $s = shift; # We reduce multiple space characters to a single one. $s =~ s/ +/ /g; # Count all non-space characters. Note that `()' applies a list context # to the capture that is used to count the elements. $curr_elem_size += () = $s =~ /[^ ]/g; $curr_offset += $s =~ s/\G(.)/'$1', /g; return $s; } sub convert_literal { my $s = shift; my $orig = $s; # ASCII printables and space my $safe_re = '\x20-\x7E'; # ASCII printables and space, no backslash my $safe_no_backslash_re = '\x20-\x5B\x5D-\x7E'; $s =~ s{ (?: \\? ( [^$safe_re] ) | ( (?: [$safe_no_backslash_re] | \\ [$safe_re] )+ ) ) } { defined($1) ? convert_non_ascii_char($1) : convert_ascii_chars($2) }egx; # We assume that `$orig' doesn't contain `*/' return $s . " /* $orig */"; } sub aux_name { return "af_blue_" . $num_sections. "_" . join('_', @name_stack); } sub aux_name_next { $name_stack[$#name_stack]++; my $name = aux_name(); $name_stack[$#name_stack]--; return $name; } sub enum_val_string { # Build string that holds code to save the current offset in an # enumeration element. my $aux = shift; my $add = ($last_aux eq "af_blue_" . $num_sections . "_0" ) ? "" : "$last_aux + "; return " $aux = $add$curr_offset,\n"; } # Process data file. open(DATA, $datafile) || die "$prog: can't open \`$datafile': $OS_ERROR\n"; while () { strip_newline(); next if /$comment_re/; next if /$whitespace_only_re/; if (/$section_re/) { Warn("previous section is empty") if ($have_sections && !$have_strings && !$have_blocks); end_curr_string(); update_max_elem_size(); # Save captured groups from `section_re'. $curr_enum = $1; $curr_array = $2; $curr_max = $3; $curr_enum_element = ""; $curr_offset = 0; Warn("overwriting already defined enumeration \`$curr_enum'") if exists($diversions{$curr_enum}); Warn("overwriting already defined array \`$curr_array'") if exists($diversions{$curr_array}); Warn("overwriting already defined maximum value \`$curr_max'") if exists($diversions{$curr_max}); $diversions{$curr_enum} = []; $diversions{$curr_array} = []; $diversions{$curr_max} = []; push @{$diversions{$curr_max}}, 0; @name_stack = (); push @name_stack, 0; $have_sections = 1; $have_strings = 0; $have_blocks = 0; $have_enum_element = 0; $in_string = 0; $num_sections++; $curr_elem_size = 0; $last_aux = aux_name(); next; } if (/$preprocessor_re/) { if ($have_sections) { # Having preprocessor conditionals complicates the computation of # correct offset values. We have to introduce auxiliary enumeration # elements with the name `af_blue____...' that store # offsets to be used in conditional clauses. `' is the number of # sections seen so far, `' is the number of `#if' and `#endif' # conditionals seen so far in the topmost level, `' the number of # `#if' and `#endif' conditionals seen so far one level deeper, etc. # As a consequence, uneven values are used within a clause, and even # values after a clause, since the C standard doesn't allow the # redefinition of an enumeration value. For example, the name # `af_blue_5_1_6' is used to construct enumeration values in the fifth # section after the third (second-level) if-clause within the first # (top-level) if-clause. After the first top-level clause has # finished, `af_blue_5_2' is used. The current offset is then # relative to the value stored in the current auxiliary element. if (/ ^ \# \s* if /x) { push @else_stack, 0; $name_stack[$#name_stack]++; push @{$diversions{$curr_enum}}, enum_val_string(aux_name()); $last_aux = aux_name(); push @name_stack, 0; $curr_offset = 0; } elsif (/ ^ \# \s* elif /x) { Die("unbalanced #elif") unless @else_stack; pop @name_stack; push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next()); $last_aux = aux_name(); push @name_stack, 0; $curr_offset = 0; } elsif (/ ^ \# \s* else /x) { my $prev_else = pop @else_stack; Die("unbalanced #else") unless defined($prev_else); Die("#else already seen") if $prev_else; push @else_stack, 1; pop @name_stack; push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next()); $last_aux = aux_name(); push @name_stack, 0; $curr_offset = 0; } elsif (/ ^ (\# \s*) endif /x) { my $prev_else = pop @else_stack; Die("unbalanced #endif") unless defined($prev_else); pop @name_stack; # If there is no else-clause for an if-clause, we add one. This is # necessary to have correct offsets. if (!$prev_else) { # Use amount of whitespace from `endif'. push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next()) . $1 . "else\n"; $last_aux = aux_name(); $curr_offset = 0; } $name_stack[$#name_stack]++; push @{$diversions{$curr_enum}}, enum_val_string(aux_name()); $last_aux = aux_name(); $curr_offset = 0; } # Handle (probably continued) preprocessor lines. CONTINUED_LOOP: { do { strip_newline(); push @{$diversions{$curr_enum}}, $ARG . "\n"; push @{$diversions{$curr_array}}, $ARG . "\n"; last CONTINUED_LOOP unless / \\ $ /x; } while (); } } else { warn_before(); } next; } if (/$enum_element_re/) { end_curr_string(); update_max_elem_size(); $curr_enum_element = $1; $have_enum_element = 1; $curr_elem_size = 0; next; } if (/$string_re/) { if ($have_sections) { Die("strings and blocks can't be mixed in a section") if $have_blocks; # Save captured group from `string_re'. my $string = $1; if ($have_enum_element) { push @{$diversions{$curr_enum}}, enum_val_string($curr_enum_element); $have_enum_element = 0; } $string = convert_literal($string); push @{$diversions{$curr_array}}, " $string\n"; $have_strings = 1; $in_string = 1; } else { warn_before(); } next; } if (/$block_start_re/) { if ($have_sections) { Die("strings and blocks can't be mixed in a section") if $have_strings; my $depth = 0; my $block = ""; my $block_end = 0; # Count braces while getting the block. BRACE_LOOP: { do { strip_newline(); foreach my $substring (split(/$brace_re/)) { if ($block_end) { Die("invalid data after last matching closing brace") if $substring !~ /$whitespace_only_re/; } $block .= $substring; if ($substring eq '{') { $depth++; } elsif ($substring eq '}') { $depth--; $block_end = 1 if $depth == 0; } } # If we are here, we have run out of substrings, so get next line # or exit. last BRACE_LOOP if $block_end; $block .= "\n"; } while (); } if ($have_enum_element) { push @{$diversions{$curr_enum}}, enum_val_string($curr_enum_element); $have_enum_element = 0; } push @{$diversions{$curr_array}}, $block . ",\n"; $curr_offset++; $curr_elem_size++; $have_blocks = 1; } else { warn_before(); } next; } # Garbage. We weren't able to parse the data. Die("syntax error"); } # Finalize data. end_curr_string(); update_max_elem_size(); # Filter stdin to stdout, replacing `@...@' templates. sub emit_diversion { my $diversion_name = shift; return (exists($diversions{$1})) ? "@{$diversions{$1}}" : "@" . $diversion_name . "@"; } $LIST_SEPARATOR = ''; my $s1 = "This file has been generated by the Perl script \`$prog',"; my $s1len = length $s1; my $s2 = "using data from file \`$datafile'."; my $s2len = length $s2; my $slen = ($s1len > $s2len) ? $s1len : $s2len; print "/* " . $s1 . " " x ($slen - $s1len) . " */\n" . "/* " . $s2 . " " x ($slen - $s2len) . " */\n" . "\n"; while () { s/ @ ( [A-Za-z0-9_]+? ) @ / emit_diversion($1) /egx; print; } # EOF