with System.IO; use System.IO;
with Ada.Characters.Handling; use Ada.Characters.Handling;
with Unchecked_Conversion;
package body GNAT.Regpat is
MAGIC : constant Character := Character'Val (10#0234#);
type Opcode is
(EOP, MINMOD,
ANY, SANY, ANYOF, EXACT, EXACTF, NOTHING, SPACE, NSPACE, DIGIT, NDIGIT, ALNUM, NALNUM,
BRANCH,
STAR, PLUS, CURLY,
CURLYX,
WHILEM,
BOL, MBOL, SBOL, EOL, MEOL, SEOL,
BOUND, NBOUND,
REFF, OPEN, CLOSE);
for Opcode'Size use 8;
type Class_Byte is mod 256;
type Character_Class is array (Class_Byte range 0 .. 31) of Class_Byte;
type Bit_Conversion_Array is array (Class_Byte range 0 .. 7) of Class_Byte;
Bit_Conversion : constant Bit_Conversion_Array :=
(1, 2, 4, 8, 16, 32, 64, 128);
type Std_Class is (ANYOF_NONE,
ANYOF_ALNUM, ANYOF_NALNUM,
ANYOF_SPACE, ANYOF_NSPACE,
ANYOF_DIGIT, ANYOF_NDIGIT,
ANYOF_ALNUMC, ANYOF_NALNUMC,
ANYOF_ALPHA, ANYOF_NALPHA,
ANYOF_ASCII, ANYOF_NASCII,
ANYOF_CNTRL, ANYOF_NCNTRL,
ANYOF_GRAPH, ANYOF_NGRAPH,
ANYOF_LOWER, ANYOF_NLOWER,
ANYOF_PRINT, ANYOF_NPRINT,
ANYOF_PUNCT, ANYOF_NPUNCT,
ANYOF_UPPER, ANYOF_NUPPER,
ANYOF_XDIGIT, ANYOF_NXDIGIT
);
procedure Set_In_Class
(Bitmap : in out Character_Class;
C : Character);
function Get_From_Class
(Bitmap : Character_Class;
C : Character) return Boolean;
procedure Reset_Class (Bitmap : out Character_Class);
pragma Inline (Set_In_Class);
pragma Inline (Get_From_Class);
pragma Inline (Reset_Class);
function "=" (Left : Character; Right : Opcode) return Boolean;
function Is_Alnum (C : Character) return Boolean;
function Is_White_Space (C : Character) return Boolean;
function Is_Printable (C : Character) return Boolean;
function Operand (P : Pointer) return Pointer;
function String_Length
(Program : Program_Data;
P : Pointer) return Program_Size;
function String_Operand (P : Pointer) return Pointer;
procedure Bitmap_Operand
(Program : Program_Data;
P : Pointer;
Op : out Character_Class);
function Get_Next_Offset
(Program : Program_Data;
IP : Pointer) return Pointer;
function Get_Next
(Program : Program_Data;
IP : Pointer) return Pointer;
procedure Optimize (Self : in out Pattern_Matcher);
function Read_Natural
(Program : Program_Data;
IP : Pointer) return Natural;
pragma Inline ("=");
pragma Inline (Is_Alnum);
pragma Inline (Is_White_Space);
pragma Inline (Get_Next);
pragma Inline (Get_Next_Offset);
pragma Inline (Operand);
pragma Inline (Read_Natural);
pragma Inline (String_Length);
pragma Inline (String_Operand);
type Expression_Flags is record
Has_Width, Simple, SP_Start : Boolean; end record;
Worst_Expression : constant Expression_Flags := (others => False);
function "=" (Left : Character; Right : Opcode) return Boolean is
begin
return Character'Pos (Left) = Opcode'Pos (Right);
end "=";
procedure Bitmap_Operand
(Program : Program_Data;
P : Pointer;
Op : out Character_Class)
is
function Convert is new Unchecked_Conversion
(Program_Data, Character_Class);
begin
Op (0 .. 31) := Convert (Program (P + 3 .. P + 34));
end Bitmap_Operand;
procedure Compile
(Matcher : out Pattern_Matcher;
Expression : String;
Final_Code_Size : out Program_Size;
Flags : Regexp_Flags := No_Flags)
is
PM : Pattern_Matcher renames Matcher;
Program : Program_Data renames PM.Program;
Emit_Code : constant Boolean := PM.Size > 0;
Emit_Ptr : Pointer := Program_First;
Parse_Pos : Natural := Expression'First; Parse_End : constant Natural := Expression'Last;
procedure Emit (B : Character);
function Emit_Node (Op : Opcode) return Pointer;
procedure Emit_Natural (IP : Pointer; N : Natural);
procedure Emit_Class (Bitmap : Character_Class);
procedure Case_Emit (C : Character);
procedure Parse
(Parenthesized : Boolean;
Flags : out Expression_Flags;
IP : out Pointer);
procedure Parse_Branch
(Flags : out Expression_Flags;
First : Boolean;
IP : out Pointer);
procedure Parse_Piece
(Expr_Flags : out Expression_Flags;
IP : out Pointer);
procedure Parse_Atom
(Expr_Flags : out Expression_Flags;
IP : out Pointer);
procedure Insert_Operator
(Op : Opcode;
Operand : Pointer;
Greedy : Boolean := True);
procedure Insert_Curly_Operator
(Op : Opcode;
Min : Natural;
Max : Natural;
Operand : Pointer;
Greedy : Boolean := True);
procedure Link_Tail (P, Val : Pointer);
procedure Link_Operand_Tail (P, Val : Pointer);
function Next_Instruction (P : Pointer) return Pointer;
procedure Fail (M : in String);
pragma No_Return (Fail);
function Is_Curly_Operator (IP : Natural) return Boolean;
function Is_Mult (IP : Natural) return Boolean;
procedure Get_Curly_Arguments
(IP : Natural;
Min : out Natural;
Max : out Natural;
Greedy : out Boolean);
procedure Parse_Character_Class (IP : out Pointer);
procedure Parse_Literal
(Expr_Flags : out Expression_Flags;
IP : out Pointer);
function Parse_Posix_Character_Class return Std_Class;
pragma Inline (Is_Mult);
pragma Inline (Emit_Natural);
pragma Inline (Parse_Character_Class);
procedure Case_Emit (C : Character) is
begin
if (Flags and Case_Insensitive) /= 0 then
Emit (To_Lower (C));
else
Emit (C);
end if;
end Case_Emit;
procedure Emit (B : Character) is
begin
if Emit_Code then
Program (Emit_Ptr) := B;
end if;
Emit_Ptr := Emit_Ptr + 1;
end Emit;
procedure Emit_Class (Bitmap : Character_Class) is
subtype Program31 is Program_Data (0 .. 31);
function Convert is new Unchecked_Conversion
(Character_Class, Program31);
begin
if Emit_Code then
Program (Emit_Ptr .. Emit_Ptr + 31) := Convert (Bitmap);
end if;
Emit_Ptr := Emit_Ptr + 32;
end Emit_Class;
procedure Emit_Natural (IP : Pointer; N : Natural) is
begin
if Emit_Code then
Program (IP + 1) := Character'Val (N / 256);
Program (IP) := Character'Val (N mod 256);
end if;
end Emit_Natural;
function Emit_Node (Op : Opcode) return Pointer is
Result : constant Pointer := Emit_Ptr;
begin
if Emit_Code then
Program (Emit_Ptr) := Character'Val (Opcode'Pos (Op));
Program (Emit_Ptr + 1) := ASCII.NUL;
Program (Emit_Ptr + 2) := ASCII.NUL;
end if;
Emit_Ptr := Emit_Ptr + 3;
return Result;
end Emit_Node;
procedure Fail (M : in String) is
begin
raise Expression_Error;
end Fail;
procedure Get_Curly_Arguments
(IP : Natural;
Min : out Natural;
Max : out Natural;
Greedy : out Boolean)
is
pragma Unreferenced (IP);
Save_Pos : Natural := Parse_Pos + 1;
begin
Min := 0;
Max := Max_Curly_Repeat;
while Expression (Parse_Pos) /= '}'
and then Expression (Parse_Pos) /= ','
loop
Parse_Pos := Parse_Pos + 1;
end loop;
Min := Natural'Value (Expression (Save_Pos .. Parse_Pos - 1));
if Expression (Parse_Pos) = ',' then
Save_Pos := Parse_Pos + 1;
while Expression (Parse_Pos) /= '}' loop
Parse_Pos := Parse_Pos + 1;
end loop;
if Save_Pos /= Parse_Pos then
Max := Natural'Value (Expression (Save_Pos .. Parse_Pos - 1));
end if;
else
Max := Min;
end if;
if Parse_Pos < Expression'Last
and then Expression (Parse_Pos + 1) = '?'
then
Greedy := False;
Parse_Pos := Parse_Pos + 1;
else
Greedy := True;
end if;
end Get_Curly_Arguments;
procedure Insert_Curly_Operator
(Op : Opcode;
Min : Natural;
Max : Natural;
Operand : Pointer;
Greedy : Boolean := True)
is
Dest : constant Pointer := Emit_Ptr;
Old : Pointer;
Size : Pointer := 7;
begin
if not Greedy then
Size := Size + 3;
end if;
if Emit_Code then
Program (Operand + Size .. Emit_Ptr + Size) :=
Program (Operand .. Emit_Ptr);
end if;
Emit_Ptr := Operand;
if not Greedy then
Old := Emit_Node (MINMOD);
Link_Tail (Old, Old + 3);
end if;
Old := Emit_Node (Op);
Emit_Natural (Old + 3, Min);
Emit_Natural (Old + 5, Max);
Emit_Ptr := Dest + Size;
end Insert_Curly_Operator;
procedure Insert_Operator
(Op : Opcode;
Operand : Pointer;
Greedy : Boolean := True)
is
Dest : constant Pointer := Emit_Ptr;
Old : Pointer;
Size : Pointer := 3;
begin
if not Greedy then
Size := Size + 3;
end if;
if Emit_Code then
Program (Operand + Size .. Emit_Ptr + Size) :=
Program (Operand .. Emit_Ptr);
end if;
Emit_Ptr := Operand;
if not Greedy then
Old := Emit_Node (MINMOD);
Link_Tail (Old, Old + 3);
end if;
Old := Emit_Node (Op);
Emit_Ptr := Dest + Size;
end Insert_Operator;
function Is_Curly_Operator (IP : Natural) return Boolean is
Scan : Natural := IP;
begin
if Expression (Scan) /= '{'
or else Scan + 2 > Expression'Last
or else not Is_Digit (Expression (Scan + 1))
then
return False;
end if;
Scan := Scan + 1;
loop
Scan := Scan + 1;
if Scan > Expression'Last then
return False;
end if;
exit when not Is_Digit (Expression (Scan));
end loop;
if Expression (Scan) = ',' then
loop
Scan := Scan + 1;
if Scan > Expression'Last then
return False;
end if;
exit when not Is_Digit (Expression (Scan));
end loop;
end if;
return Expression (Scan) = '}';
end Is_Curly_Operator;
function Is_Mult (IP : Natural) return Boolean is
C : constant Character := Expression (IP);
begin
return C = '*'
or else C = '+'
or else C = '?'
or else (C = '{' and then Is_Curly_Operator (IP));
end Is_Mult;
procedure Link_Operand_Tail (P, Val : Pointer) is
begin
if Emit_Code and then Program (P) = BRANCH then
Link_Tail (Operand (P), Val);
end if;
end Link_Operand_Tail;
procedure Link_Tail (P, Val : Pointer) is
Scan : Pointer;
Temp : Pointer;
Offset : Pointer;
begin
if not Emit_Code then
return;
end if;
Scan := P;
loop
Temp := Next_Instruction (Scan);
exit when Temp = 0;
Scan := Temp;
end loop;
Offset := Val - Scan;
Emit_Natural (Scan + 1, Natural (Offset));
end Link_Tail;
function Next_Instruction (P : Pointer) return Pointer is
Offset : Pointer;
begin
if not Emit_Code then
return 0;
end if;
Offset := Get_Next_Offset (Program, P);
if Offset = 0 then
return 0;
end if;
return P + Offset;
end Next_Instruction;
procedure Parse
(Parenthesized : in Boolean;
Flags : out Expression_Flags;
IP : out Pointer)
is
E : String renames Expression;
Br : Pointer;
Ender : Pointer;
Par_No : Natural;
New_Flags : Expression_Flags;
Have_Branch : Boolean := False;
begin
Flags := (Has_Width => True, others => False);
if Parenthesized then
if Matcher.Paren_Count > Max_Paren_Count then
Fail ("too many ()");
end if;
Par_No := Matcher.Paren_Count + 1;
Matcher.Paren_Count := Matcher.Paren_Count + 1;
IP := Emit_Node (OPEN);
Emit (Character'Val (Par_No));
else
IP := 0;
Par_No := 0;
end if;
Parse_Branch (New_Flags, True, Br);
if Br = 0 then
IP := 0;
return;
end if;
if Parse_Pos <= Parse_End
and then E (Parse_Pos) = '|'
then
Insert_Operator (BRANCH, Br);
Have_Branch := True;
end if;
if IP /= 0 then
Link_Tail (IP, Br); else
IP := Br;
end if;
if not New_Flags.Has_Width then
Flags.Has_Width := False;
end if;
Flags.SP_Start := Flags.SP_Start or New_Flags.SP_Start;
while Parse_Pos <= Parse_End
and then (E (Parse_Pos) = '|')
loop
Parse_Pos := Parse_Pos + 1;
Parse_Branch (New_Flags, False, Br);
if Br = 0 then
IP := 0;
return;
end if;
Link_Tail (IP, Br);
if not New_Flags.Has_Width then
Flags.Has_Width := False;
end if;
Flags.SP_Start := Flags.SP_Start or New_Flags.SP_Start;
end loop;
if Parenthesized then
Ender := Emit_Node (CLOSE);
Emit (Character'Val (Par_No));
else
Ender := Emit_Node (EOP);
end if;
Link_Tail (IP, Ender);
if Have_Branch then
Br := IP;
loop
exit when Br = 0;
Link_Operand_Tail (Br, Ender);
Br := Next_Instruction (Br);
end loop;
end if;
if Parenthesized then
if Parse_Pos > Parse_End or else E (Parse_Pos) /= ')' then
Fail ("unmatched ()");
end if;
Parse_Pos := Parse_Pos + 1;
elsif Parse_Pos <= Parse_End then
if E (Parse_Pos) = ')' then
Fail ("unmatched ()");
else
Fail ("junk on end"); end if;
end if;
end Parse;
procedure Parse_Atom
(Expr_Flags : out Expression_Flags;
IP : out Pointer)
is
C : Character;
begin
Expr_Flags := Worst_Expression;
C := Expression (Parse_Pos);
Parse_Pos := Parse_Pos + 1;
case (C) is
when '^' =>
if (Flags and Multiple_Lines) /= 0 then
IP := Emit_Node (MBOL);
elsif (Flags and Single_Line) /= 0 then
IP := Emit_Node (SBOL);
else
IP := Emit_Node (BOL);
end if;
when '$' =>
if (Flags and Multiple_Lines) /= 0 then
IP := Emit_Node (MEOL);
elsif (Flags and Single_Line) /= 0 then
IP := Emit_Node (SEOL);
else
IP := Emit_Node (EOL);
end if;
when '.' =>
if (Flags and Single_Line) /= 0 then
IP := Emit_Node (SANY);
else
IP := Emit_Node (ANY);
end if;
Expr_Flags.Has_Width := True;
Expr_Flags.Simple := True;
when '[' =>
Parse_Character_Class (IP);
Expr_Flags.Has_Width := True;
Expr_Flags.Simple := True;
when '(' =>
declare
New_Flags : Expression_Flags;
begin
Parse (True, New_Flags, IP);
if IP = 0 then
return;
end if;
Expr_Flags.Has_Width :=
Expr_Flags.Has_Width or New_Flags.Has_Width;
Expr_Flags.SP_Start :=
Expr_Flags.SP_Start or New_Flags.SP_Start;
end;
when '|' | ASCII.LF | ')' =>
Fail ("internal urp");
when '?' | '+' | '*' =>
Fail (C & " follows nothing");
when '{' =>
if Is_Curly_Operator (Parse_Pos - 1) then
Fail (C & " follows nothing");
else
Parse_Literal (Expr_Flags, IP);
end if;
when '\' =>
if Parse_Pos > Parse_End then
Fail ("trailing \");
end if;
Parse_Pos := Parse_Pos + 1;
case Expression (Parse_Pos - 1) is
when 'b' =>
IP := Emit_Node (BOUND);
when 'B' =>
IP := Emit_Node (NBOUND);
when 's' =>
IP := Emit_Node (SPACE);
Expr_Flags.Simple := True;
Expr_Flags.Has_Width := True;
when 'S' =>
IP := Emit_Node (NSPACE);
Expr_Flags.Simple := True;
Expr_Flags.Has_Width := True;
when 'd' =>
IP := Emit_Node (DIGIT);
Expr_Flags.Simple := True;
Expr_Flags.Has_Width := True;
when 'D' =>
IP := Emit_Node (NDIGIT);
Expr_Flags.Simple := True;
Expr_Flags.Has_Width := True;
when 'w' =>
IP := Emit_Node (ALNUM);
Expr_Flags.Simple := True;
Expr_Flags.Has_Width := True;
when 'W' =>
IP := Emit_Node (NALNUM);
Expr_Flags.Simple := True;
Expr_Flags.Has_Width := True;
when 'A' =>
IP := Emit_Node (SBOL);
when 'G' =>
IP := Emit_Node (SEOL);
when '0' .. '9' =>
IP := Emit_Node (REFF);
declare
Save : constant Natural := Parse_Pos - 1;
begin
while Parse_Pos <= Expression'Last
and then Is_Digit (Expression (Parse_Pos))
loop
Parse_Pos := Parse_Pos + 1;
end loop;
Emit (Character'Val (Natural'Value
(Expression (Save .. Parse_Pos - 1))));
end;
when others =>
Parse_Pos := Parse_Pos - 1;
Parse_Literal (Expr_Flags, IP);
end case;
when others =>
Parse_Literal (Expr_Flags, IP);
end case;
end Parse_Atom;
------------------
-- Parse_Branch --
------------------
procedure Parse_Branch
(Flags : out Expression_Flags;
First : Boolean;
IP : out Pointer)
is
E : String renames Expression;
Chain : Pointer;
Last : Pointer;
New_Flags : Expression_Flags;
Discard : Pointer;
pragma Warnings (Off, Discard);
begin
Flags := Worst_Expression; -- Tentatively
if First then
IP := Emit_Ptr;
else
IP := Emit_Node (BRANCH);
end if;
Chain := 0;
while Parse_Pos <= Parse_End
and then E (Parse_Pos) /= ')'
and then E (Parse_Pos) /= ASCII.LF
and then E (Parse_Pos) /= '|'
loop
Parse_Piece (New_Flags, Last);
if Last = 0 then
IP := 0;
return;
end if;
Flags.Has_Width := Flags.Has_Width or New_Flags.Has_Width;
if Chain = 0 then -- First piece
Flags.SP_Start := Flags.SP_Start or New_Flags.SP_Start;
else
Link_Tail (Chain, Last);
end if;
Chain := Last;
end loop;
-- Case where loop ran zero CURLY
if Chain = 0 then
Discard := Emit_Node (NOTHING);
end if;
end Parse_Branch;
---------------------------
-- Parse_Character_Class --
---------------------------
procedure Parse_Character_Class (IP : out Pointer) is
Bitmap : Character_Class;
Invert : Boolean := False;
In_Range : Boolean := False;
Named_Class : Std_Class := ANYOF_NONE;
Value : Character;
Last_Value : Character := ASCII.Nul;
begin
Reset_Class (Bitmap);
-- Do we have an invert character class ?
if Parse_Pos <= Parse_End
and then Expression (Parse_Pos) = '^'
then
Invert := True;
Parse_Pos := Parse_Pos + 1;
end if;
-- First character can be ] or -, without closing the class.
if Parse_Pos <= Parse_End
and then (Expression (Parse_Pos) = ']'
or else Expression (Parse_Pos) = '-')
then
Set_In_Class (Bitmap, Expression (Parse_Pos));
Parse_Pos := Parse_Pos + 1;
end if;
-- While we don't have the end of the class
while Parse_Pos <= Parse_End
and then Expression (Parse_Pos) /= ']'
loop
Named_Class := ANYOF_NONE;
Value := Expression (Parse_Pos);
Parse_Pos := Parse_Pos + 1;
-- Do we have a Posix character class
if Value = '[' then
Named_Class := Parse_Posix_Character_Class;
elsif Value = '\' then
if Parse_Pos = Parse_End then
Fail ("Trailing \");
end if;
Value := Expression (Parse_Pos);
Parse_Pos := Parse_Pos + 1;
case Value is
when 'w' => Named_Class := ANYOF_ALNUM;
when 'W' => Named_Class := ANYOF_NALNUM;
when 's' => Named_Class := ANYOF_SPACE;
when 'S' => Named_Class := ANYOF_NSPACE;
when 'd' => Named_Class := ANYOF_DIGIT;
when 'D' => Named_Class := ANYOF_NDIGIT;
when 'n' => Value := ASCII.LF;
when 'r' => Value := ASCII.CR;
when 't' => Value := ASCII.HT;
when 'f' => Value := ASCII.FF;
when 'e' => Value := ASCII.ESC;
when 'a' => Value := ASCII.BEL;
-- when 'x' => ??? hexadecimal value
-- when 'c' => ??? control character
-- when '0'..'9' => ??? octal character
when others => null;
end case;
end if;
-- Do we have a character class?
if Named_Class /= ANYOF_NONE then
-- A range like 'a-\d' or 'a-[:digit:] is not a range
if In_Range then
Set_In_Class (Bitmap, Last_Value);
Set_In_Class (Bitmap, '-');
In_Range := False;
end if;
-- Expand the range
case Named_Class is
when ANYOF_NONE => null;
when ANYOF_ALNUM | ANYOF_ALNUMC =>
for Value in Class_Byte'Range loop
if Is_Alnum (Character'Val (Value)) then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
when ANYOF_NALNUM | ANYOF_NALNUMC =>
for Value in Class_Byte'Range loop
if not Is_Alnum (Character'Val (Value)) then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
when ANYOF_SPACE =>
for Value in Class_Byte'Range loop
if Is_White_Space (Character'Val (Value)) then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
when ANYOF_NSPACE =>
for Value in Class_Byte'Range loop
if not Is_White_Space (Character'Val (Value)) then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
when ANYOF_DIGIT =>
for Value in Class_Byte'Range loop
if Is_Digit (Character'Val (Value)) then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
when ANYOF_NDIGIT =>
for Value in Class_Byte'Range loop
if not Is_Digit (Character'Val (Value)) then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
when ANYOF_ALPHA =>
for Value in Class_Byte'Range loop
if Is_Letter (Character'Val (Value)) then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
when ANYOF_NALPHA =>
for Value in Class_Byte'Range loop
if not Is_Letter (Character'Val (Value)) then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
when ANYOF_ASCII =>
for Value in 0 .. 127 loop
Set_In_Class (Bitmap, Character'Val (Value));
end loop;
when ANYOF_NASCII =>
for Value in 128 .. 255 loop
Set_In_Class (Bitmap, Character'Val (Value));
end loop;
when ANYOF_CNTRL =>
for Value in Class_Byte'Range loop
if Is_Control (Character'Val (Value)) then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
when ANYOF_NCNTRL =>
for Value in Class_Byte'Range loop
if not Is_Control (Character'Val (Value)) then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
when ANYOF_GRAPH =>
for Value in Class_Byte'Range loop
if Is_Graphic (Character'Val (Value)) then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
when ANYOF_NGRAPH =>
for Value in Class_Byte'Range loop
if not Is_Graphic (Character'Val (Value)) then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
when ANYOF_LOWER =>
for Value in Class_Byte'Range loop
if Is_Lower (Character'Val (Value)) then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
when ANYOF_NLOWER =>
for Value in Class_Byte'Range loop
if not Is_Lower (Character'Val (Value)) then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
when ANYOF_PRINT =>
for Value in Class_Byte'Range loop
if Is_Printable (Character'Val (Value)) then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
when ANYOF_NPRINT =>
for Value in Class_Byte'Range loop
if not Is_Printable (Character'Val (Value)) then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
when ANYOF_PUNCT =>
for Value in Class_Byte'Range loop
if Is_Printable (Character'Val (Value))
and then not Is_White_Space (Character'Val (Value))
and then not Is_Alnum (Character'Val (Value))
then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
when ANYOF_NPUNCT =>
for Value in Class_Byte'Range loop
if not Is_Printable (Character'Val (Value))
or else Is_White_Space (Character'Val (Value))
or else Is_Alnum (Character'Val (Value))
then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
when ANYOF_UPPER =>
for Value in Class_Byte'Range loop
if Is_Upper (Character'Val (Value)) then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
when ANYOF_NUPPER =>
for Value in Class_Byte'Range loop
if not Is_Upper (Character'Val (Value)) then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
when ANYOF_XDIGIT =>
for Value in Class_Byte'Range loop
if Is_Hexadecimal_Digit (Character'Val (Value)) then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
when ANYOF_NXDIGIT =>
for Value in Class_Byte'Range loop
if not Is_Hexadecimal_Digit
(Character'Val (Value))
then
Set_In_Class (Bitmap, Character'Val (Value));
end if;
end loop;
end case;
-- Not a character range
elsif not In_Range then
Last_Value := Value;
if Expression (Parse_Pos) = '-'
and then Parse_Pos < Parse_End
and then Expression (Parse_Pos + 1) /= ']'
then
Parse_Pos := Parse_Pos + 1;
-- Do we have a range like '\d-a' and '[:space:]-a'
-- which is not a real range
if Named_Class /= ANYOF_NONE then
Set_In_Class (Bitmap, '-');
else
In_Range := True;
end if;
else
Set_In_Class (Bitmap, Value);
end if;
-- Else in a character range
else
if Last_Value > Value then
Fail ("Invalid Range [" & Last_Value'Img
& "-" & Value'Img & "]");
end if;
while Last_Value <= Value loop
Set_In_Class (Bitmap, Last_Value);
Last_Value := Character'Succ (Last_Value);
end loop;
In_Range := False;
end if;
end loop;
-- Optimize case-insensitive ranges (put the upper case or lower
-- case character into the bitmap)
if (Flags and Case_Insensitive) /= 0 then
for C in Character'Range loop
if Get_From_Class (Bitmap, C) then
Set_In_Class (Bitmap, To_Lower (C));
Set_In_Class (Bitmap, To_Upper (C));
end if;
end loop;
end if;
-- Optimize inverted classes
if Invert then
for J in Bitmap'Range loop
Bitmap (J) := not Bitmap (J);
end loop;
end if;
Parse_Pos := Parse_Pos + 1;
-- Emit the class
IP := Emit_Node (ANYOF);
Emit_Class (Bitmap);
end Parse_Character_Class;
-------------------
-- Parse_Literal --
-------------------
-- This is a bit tricky due to quoted chars and due to
-- the multiplier characters '*', '+', and '?' that
-- take the SINGLE char previous as their operand.
-- On entry, the character at Parse_Pos - 1 is going to go
-- into the string, no matter what it is. It could be
-- following a \ if Parse_Atom was entered from the '\' case.
-- Basic idea is to pick up a good char in C and examine
-- the next char. If Is_Mult (C) then twiddle, if it's a \
-- then frozzle and if it's another magic char then push C and
-- terminate the string. If none of the above, push C on the
-- string and go around again.
-- Start_Pos is used to remember where "the current character"
-- starts in the string, if due to an Is_Mult we need to back
-- up and put the current char in a separate 1-character string.
-- When Start_Pos is 0, C is the only char in the string;
-- this is used in Is_Mult handling, and in setting the SIMPLE
-- flag at the end.
procedure Parse_Literal
(Expr_Flags : out Expression_Flags;
IP : out Pointer)
is
Start_Pos : Natural := 0;
C : Character;
Length_Ptr : Pointer;
Has_Special_Operator : Boolean := False;
begin
Parse_Pos := Parse_Pos - 1; -- Look at current character
if (Flags and Case_Insensitive) /= 0 then
IP := Emit_Node (EXACTF);
else
IP := Emit_Node (EXACT);
end if;
Length_Ptr := Emit_Ptr;
Emit_Ptr := String_Operand (IP);
Parse_Loop :
loop
C := Expression (Parse_Pos); -- Get current character
case C is
when '.' | '[' | '(' | ')' | '|' | ASCII.LF | '$' | '^' =>
if Start_Pos = 0 then
Start_Pos := Parse_Pos;
Emit (C); -- First character is always emitted
else
exit Parse_Loop; -- Else we are done
end if;
when '?' | '+' | '*' | '{' =>
if Start_Pos = 0 then
Start_Pos := Parse_Pos;
Emit (C); -- First character is always emitted
-- Are we looking at an operator, or is this
-- simply a normal character ?
elsif not Is_Mult (Parse_Pos) then
Start_Pos := Parse_Pos;
Case_Emit (C);
else
-- We've got something like "abc?d". Mark this as a
-- special case. What we want to emit is a first
-- constant string for "ab", then one for "c" that will
-- ultimately be transformed with a CURLY operator, A
-- special case has to be handled for "a?", since there
-- is no initial string to emit.
Has_Special_Operator := True;
exit Parse_Loop;
end if;
when '\' =>
Start_Pos := Parse_Pos;
if Parse_Pos = Parse_End then
Fail ("Trailing \");
else
case Expression (Parse_Pos + 1) is
when 'b' | 'B' | 's' | 'S' | 'd' | 'D'
| 'w' | 'W' | '0' .. '9' | 'G' | 'A'
=> exit Parse_Loop;
when 'n' => Emit (ASCII.LF);
when 't' => Emit (ASCII.HT);
when 'r' => Emit (ASCII.CR);
when 'f' => Emit (ASCII.FF);
when 'e' => Emit (ASCII.ESC);
when 'a' => Emit (ASCII.BEL);
when others => Emit (Expression (Parse_Pos + 1));
end case;
Parse_Pos := Parse_Pos + 1;
end if;
when others =>
Start_Pos := Parse_Pos;
Case_Emit (C);
end case;
exit Parse_Loop when Emit_Ptr - Length_Ptr = 254;
Parse_Pos := Parse_Pos + 1;
exit Parse_Loop when Parse_Pos > Parse_End;
end loop Parse_Loop;
-- Is the string followed by a '*+?{' operator ? If yes, and if there
-- is an initial string to emit, do it now.
if Has_Special_Operator
and then Emit_Ptr >= Length_Ptr + 3
then
Emit_Ptr := Emit_Ptr - 1;
Parse_Pos := Start_Pos;
end if;
if Emit_Code then
Program (Length_Ptr) := Character'Val (Emit_Ptr - Length_Ptr - 2);
end if;
Expr_Flags.Has_Width := True;
-- Slight optimization when there is a single character
if Emit_Ptr = Length_Ptr + 2 then
Expr_Flags.Simple := True;
end if;
end Parse_Literal;
-----------------
-- Parse_Piece --
-----------------
-- Note that the branching code sequences used for '?' and the
-- general cases of '*' and + are somewhat optimized: they use
-- the same NOTHING node as both the endmarker for their branch
-- list and the body of the last branch. It might seem that
-- this node could be dispensed with entirely, but the endmarker
-- role is not redundant.
procedure Parse_Piece
(Expr_Flags : out Expression_Flags;
IP : out Pointer)
is
Op : Character;
New_Flags : Expression_Flags;
Greedy : Boolean := True;
begin
Parse_Atom (New_Flags, IP);
if IP = 0 then
return;
end if;
if Parse_Pos > Parse_End
or else not Is_Mult (Parse_Pos)
then
Expr_Flags := New_Flags;
return;
end if;
Op := Expression (Parse_Pos);
if Op /= '+' then
Expr_Flags := (SP_Start => True, others => False);
else
Expr_Flags := (Has_Width => True, others => False);
end if;
-- Detect non greedy operators in the easy cases
if Op /= '{'
and then Parse_Pos + 1 <= Parse_End
and then Expression (Parse_Pos + 1) = '?'
then
Greedy := False;
Parse_Pos := Parse_Pos + 1;
end if;
-- Generate the byte code
case Op is
when '*' =>
if New_Flags.Simple then
Insert_Operator (STAR, IP, Greedy);
else
Link_Tail (IP, Emit_Node (WHILEM));
Insert_Curly_Operator
(CURLYX, 0, Max_Curly_Repeat, IP, Greedy);
Link_Tail (IP, Emit_Node (NOTHING));
end if;
when '+' =>
if New_Flags.Simple then
Insert_Operator (PLUS, IP, Greedy);
else
Link_Tail (IP, Emit_Node (WHILEM));
Insert_Curly_Operator
(CURLYX, 1, Max_Curly_Repeat, IP, Greedy);
Link_Tail (IP, Emit_Node (NOTHING));
end if;
when '?' =>
if New_Flags.Simple then
Insert_Curly_Operator (CURLY, 0, 1, IP, Greedy);
else
Link_Tail (IP, Emit_Node (WHILEM));
Insert_Curly_Operator (CURLYX, 0, 1, IP, Greedy);
Link_Tail (IP, Emit_Node (NOTHING));
end if;
when '{' =>
declare
Min, Max : Natural;
begin
Get_Curly_Arguments (Parse_Pos, Min, Max, Greedy);
if New_Flags.Simple then
Insert_Curly_Operator (CURLY, Min, Max, IP, Greedy);
else
Link_Tail (IP, Emit_Node (WHILEM));
Insert_Curly_Operator (CURLYX, Min, Max, IP, Greedy);
Link_Tail (IP, Emit_Node (NOTHING));
end if;
end;
when others =>
null;
end case;
Parse_Pos := Parse_Pos + 1;
if Parse_Pos <= Parse_End
and then Is_Mult (Parse_Pos)
then
Fail ("nested *+{");
end if;
end Parse_Piece;
---------------------------------
-- Parse_Posix_Character_Class --
---------------------------------
function Parse_Posix_Character_Class return Std_Class is
Invert : Boolean := False;
Class : Std_Class := ANYOF_NONE;
E : String renames Expression;
-- Class names. Note that code assumes that the length of all
-- classes starting with the same letter have the same length.
Alnum : constant String := "alnum:]";
Alpha : constant String := "alpha:]";
Ascii_C : constant String := "ascii:]";
Cntrl : constant String := "cntrl:]";
Digit : constant String := "digit:]";
Graph : constant String := "graph:]";
Lower : constant String := "lower:]";
Print : constant String := "print:]";
Punct : constant String := "punct:]";
Space : constant String := "space:]";
Upper : constant String := "upper:]";
Word : constant String := "word:]";
Xdigit : constant String := "xdigit:]";
begin
-- Case of character class specified
if Parse_Pos <= Parse_End
and then Expression (Parse_Pos) = ':'
then
Parse_Pos := Parse_Pos + 1;
-- Do we have something like: [[:^alpha:]]
if Parse_Pos <= Parse_End
and then Expression (Parse_Pos) = '^'
then
Invert := True;
Parse_Pos := Parse_Pos + 1;
end if;
-- Check for class names based on first letter
case Expression (Parse_Pos) is
when 'a' =>
-- All 'a' classes have the same length (Alnum'Length)
if Parse_Pos + Alnum'Length - 1 <= Parse_End then
if E (Parse_Pos .. Parse_Pos + Alnum'Length - 1) =
Alnum
then
if Invert then
Class := ANYOF_NALNUMC;
else
Class := ANYOF_ALNUMC;
end if;
Parse_Pos := Parse_Pos + Alnum'Length;
elsif E (Parse_Pos .. Parse_Pos + Alpha'Length - 1) =
Alpha
then
if Invert then
Class := ANYOF_NALPHA;
else
Class := ANYOF_ALPHA;
end if;
Parse_Pos := Parse_Pos + Alpha'Length;
elsif E (Parse_Pos .. Parse_Pos + Ascii_C'Length - 1) =
Ascii_C
then
if Invert then
Class := ANYOF_NASCII;
else
Class := ANYOF_ASCII;
end if;
Parse_Pos := Parse_Pos + Ascii_C'Length;
end if;
end if;
when 'c' =>
if Parse_Pos + Cntrl'Length - 1 <= Parse_End
and then E (Parse_Pos .. Parse_Pos + Cntrl'Length - 1) =
Cntrl
then
if Invert then
Class := ANYOF_NCNTRL;
else
Class := ANYOF_CNTRL;
end if;
Parse_Pos := Parse_Pos + Cntrl'Length;
end if;
when 'd' =>
if Parse_Pos + Digit'Length - 1 <= Parse_End
and then E (Parse_Pos .. Parse_Pos + Digit'Length - 1) =
Digit
then
if Invert then
Class := ANYOF_NDIGIT;
else
Class := ANYOF_DIGIT;
end if;
Parse_Pos := Parse_Pos + Digit'Length;
end if;
when 'g' =>
if Parse_Pos + Graph'Length - 1 <= Parse_End
and then E (Parse_Pos .. Parse_Pos + Graph'Length - 1) =
Graph
then
if Invert then
Class := ANYOF_NGRAPH;
else
Class := ANYOF_GRAPH;
end if;
Parse_Pos := Parse_Pos + Graph'Length;
end if;
when 'l' =>
if Parse_Pos + Lower'Length - 1 <= Parse_End
and then E (Parse_Pos .. Parse_Pos + Lower'Length - 1) =
Lower
then
if Invert then
Class := ANYOF_NLOWER;
else
Class := ANYOF_LOWER;
end if;
Parse_Pos := Parse_Pos + Lower'Length;
end if;
when 'p' =>
-- All 'p' classes have the same length
if Parse_Pos + Print'Length - 1 <= Parse_End then
if E (Parse_Pos .. Parse_Pos + Print'Length - 1) =
Print
then
if Invert then
Class := ANYOF_NPRINT;
else
Class := ANYOF_PRINT;
end if;
Parse_Pos := Parse_Pos + Print'Length;
elsif E (Parse_Pos .. Parse_Pos + Punct'Length - 1) =
Punct
then
if Invert then
Class := ANYOF_NPUNCT;
else
Class := ANYOF_PUNCT;
end if;
Parse_Pos := Parse_Pos + Punct'Length;
end if;
end if;
when 's' =>
if Parse_Pos + Space'Length - 1 <= Parse_End
and then E (Parse_Pos .. Parse_Pos + Space'Length - 1) =
Space
then
if Invert then
Class := ANYOF_NSPACE;
else
Class := ANYOF_SPACE;
end if;
Parse_Pos := Parse_Pos + Space'Length;
end if;
when 'u' =>
if Parse_Pos + Upper'Length - 1 <= Parse_End
and then E (Parse_Pos .. Parse_Pos + Upper'Length - 1) =
Upper
then
if Invert then
Class := ANYOF_NUPPER;
else
Class := ANYOF_UPPER;
end if;
Parse_Pos := Parse_Pos + Upper'Length;
end if;
when 'w' =>
if Parse_Pos + Word'Length - 1 <= Parse_End
and then E (Parse_Pos .. Parse_Pos + Word'Length - 1) =
Word
then
if Invert then
Class := ANYOF_NALNUM;
else
Class := ANYOF_ALNUM;
end if;
Parse_Pos := Parse_Pos + Word'Length;
end if;
when 'x' =>
if Parse_Pos + Xdigit'Length - 1 <= Parse_End
and then E (Parse_Pos .. Parse_Pos + Xdigit'Length - 1)
= Digit
then
if Invert then
Class := ANYOF_NXDIGIT;
else
Class := ANYOF_XDIGIT;
end if;
Parse_Pos := Parse_Pos + Xdigit'Length;
end if;
when others =>
Fail ("Invalid character class");
end case;
-- Character class not specified
else
return ANYOF_NONE;
end if;
return Class;
end Parse_Posix_Character_Class;
Expr_Flags : Expression_Flags;
Result : Pointer;
-- Start of processing for Compile
begin
Emit (MAGIC);
Parse (False, Expr_Flags, Result);
if Result = 0 then
Fail ("Couldn't compile expression");
end if;
Final_Code_Size := Emit_Ptr - 1;
-- Do we want to actually compile the expression, or simply get the
-- code size ???
if Emit_Code then
Optimize (PM);
end if;
PM.Flags := Flags;
end Compile;
function Compile
(Expression : String;
Flags : Regexp_Flags := No_Flags) return Pattern_Matcher
is
Size : Program_Size;
Dummy : Pattern_Matcher (0);
begin
Compile (Dummy, Expression, Size, Flags);
declare
Result : Pattern_Matcher (Size);
begin
Compile (Result, Expression, Size, Flags);
return Result;
end;
end Compile;
procedure Compile
(Matcher : out Pattern_Matcher;
Expression : String;
Flags : Regexp_Flags := No_Flags)
is
Size : Program_Size;
begin
Compile (Matcher, Expression, Size, Flags);
end Compile;
----------
-- Dump --
----------
procedure Dump (Self : Pattern_Matcher) is
-- Index : Pointer := Program_First + 1;
-- What is the above line for ???
Op : Opcode;
Program : Program_Data renames Self.Program;
procedure Dump_Until
(Start : Pointer;
Till : Pointer;
Indent : Natural := 0);
-- Dump the program until the node Till (not included) is met.
-- Every line is indented with Index spaces at the beginning
-- Dumps till the end if Till is 0.
----------------
-- Dump_Until --
----------------
procedure Dump_Until
(Start : Pointer;
Till : Pointer;
Indent : Natural := 0)
is
Next : Pointer;
Index : Pointer := Start;
Local_Indent : Natural := Indent;
Length : Pointer;
begin
while Index < Till loop
Op := Opcode'Val (Character'Pos ((Self.Program (Index))));
if Op = CLOSE then
Local_Indent := Local_Indent - 3;
end if;
declare
Point : constant String := Pointer'Image (Index);
begin
for J in 1 .. 6 - Point'Length loop
Put (' ');
end loop;
Put (Point
& " : "
& (1 .. Local_Indent => ' ')
& Opcode'Image (Op));
end;
-- Print the parenthesis number
if Op = OPEN or else Op = CLOSE or else Op = REFF then
Put (Natural'Image (Character'Pos (Program (Index + 3))));
end if;
Next := Index + Get_Next_Offset (Program, Index);
if Next = Index then
Put (" (next at 0)");
else
Put (" (next at " & Pointer'Image (Next) & ")");
end if;
case Op is
-- Character class operand
when ANYOF => null;
declare
Bitmap : Character_Class;
Last : Character := ASCII.Nul;
Current : Natural := 0;
Current_Char : Character;
begin
Bitmap_Operand (Program, Index, Bitmap);
Put (" operand=");
while Current <= 255 loop
Current_Char := Character'Val (Current);
-- First item in a range
if Get_From_Class (Bitmap, Current_Char) then
Last := Current_Char;
-- Search for the last item in the range
loop
Current := Current + 1;
exit when Current > 255;
Current_Char := Character'Val (Current);
exit when
not Get_From_Class (Bitmap, Current_Char);
end loop;
if Last <= ' ' then
Put (Last'Img);
else
Put (Last);
end if;
if Character'Succ (Last) /= Current_Char then
Put ("-" & Character'Pred (Current_Char));
end if;
else
Current := Current + 1;
end if;
end loop;
New_Line;
Index := Index + 3 + Bitmap'Length;
end;
-- string operand
when EXACT | EXACTF =>
Length := String_Length (Program, Index);
Put (" operand (length:" & Program_Size'Image (Length + 1)
& ") ="
& String (Program (String_Operand (Index)
.. String_Operand (Index)
+ Length)));
Index := String_Operand (Index) + Length + 1;
New_Line;
-- Node operand
when BRANCH =>
New_Line;
Dump_Until (Index + 3, Next, Local_Indent + 3);
Index := Next;
when STAR | PLUS =>
New_Line;
-- Only one instruction
Dump_Until (Index + 3, Index + 4, Local_Indent + 3);
Index := Next;
when CURLY | CURLYX =>
Put (" {"
& Natural'Image (Read_Natural (Program, Index + 3))
& ","
& Natural'Image (Read_Natural (Program, Index + 5))
& "}");
New_Line;
Dump_Until (Index + 7, Next, Local_Indent + 3);
Index := Next;
when OPEN =>
New_Line;
Index := Index + 4;
Local_Indent := Local_Indent + 3;
when CLOSE | REFF =>
New_Line;
Index := Index + 4;
when EOP =>
Index := Index + 3;
New_Line;
exit;
-- No operand
when others =>
Index := Index + 3;
New_Line;
end case;
end loop;
end Dump_Until;
-- Start of processing for Dump
begin
pragma Assert (Self.Program (Program_First) = MAGIC,
"Corrupted Pattern_Matcher");
Put_Line ("Must start with (Self.First) = "
& Character'Image (Self.First));
if (Self.Flags and Case_Insensitive) /= 0 then
Put_Line (" Case_Insensitive mode");
end if;
if (Self.Flags and Single_Line) /= 0 then
Put_Line (" Single_Line mode");
end if;
if (Self.Flags and Multiple_Lines) /= 0 then
Put_Line (" Multiple_Lines mode");
end if;
Put_Line (" 1 : MAGIC");
Dump_Until (Program_First + 1, Self.Program'Last + 1);
end Dump;
--------------------
-- Get_From_Class --
--------------------
function Get_From_Class
(Bitmap : Character_Class;
C : Character) return Boolean
is
Value : constant Class_Byte := Character'Pos (C);
begin
return
(Bitmap (Value / 8) and Bit_Conversion (Value mod 8)) /= 0;
end Get_From_Class;
--------------
-- Get_Next --
--------------
function Get_Next (Program : Program_Data; IP : Pointer) return Pointer is
Offset : constant Pointer := Get_Next_Offset (Program, IP);
begin
if Offset = 0 then
return 0;
else
return IP + Offset;
end if;
end Get_Next;
---------------------
-- Get_Next_Offset --
---------------------
function Get_Next_Offset
(Program : Program_Data;
IP : Pointer) return Pointer
is
begin
return Pointer (Read_Natural (Program, IP + 1));
end Get_Next_Offset;
--------------
-- Is_Alnum --
--------------
function Is_Alnum (C : Character) return Boolean is
begin
return Is_Alphanumeric (C) or else C = '_';
end Is_Alnum;
------------------
-- Is_Printable --
------------------
function Is_Printable (C : Character) return Boolean is
begin
-- Printable if space or graphic character or other whitespace
-- Other white space includes (HT/LF/VT/FF/CR = codes 9-13)
return C in Character'Val (32) .. Character'Val (126)
or else C in ASCII.HT .. ASCII.CR;
end Is_Printable;
--------------------
-- Is_White_Space --
--------------------
function Is_White_Space (C : Character) return Boolean is
begin
-- Note: HT = 9, LF = 10, VT = 11, FF = 12, CR = 13
return C = ' ' or else C in ASCII.HT .. ASCII.CR;
end Is_White_Space;
-----------
-- Match --
-----------
procedure Match
(Self : Pattern_Matcher;
Data : String;
Matches : out Match_Array;
Data_First : Integer := -1;
Data_Last : Positive := Positive'Last)
is
Program : Program_Data renames Self.Program; -- Shorter notation
First_In_Data : constant Integer := Integer'Max (Data_First, Data'First);
Last_In_Data : constant Integer := Integer'Min (Data_Last, Data'Last);
-- Global work variables
Input_Pos : Natural; -- String-input pointer
BOL_Pos : Natural; -- Beginning of input, for ^ check
Matched : Boolean := False; -- Until proven True
Matches_Full : Match_Array (0 .. Natural'Max (Self.Paren_Count,
Matches'Last));
-- Stores the value of all the parenthesis pairs.
-- We do not use directly Matches, so that we can also use back
-- references (REFF) even if Matches is too small.
type Natural_Array is array (Match_Count range <>) of Natural;
Matches_Tmp : Natural_Array (Matches_Full'Range);
-- Save the opening position of parenthesis.
Last_Paren : Natural := 0;
-- Last parenthesis seen
Greedy : Boolean := True;
-- True if the next operator should be greedy
type Current_Curly_Record;
type Current_Curly_Access is access all Current_Curly_Record;
type Current_Curly_Record is record
Paren_Floor : Natural; -- How far back to strip parenthesis data
Cur : Integer; -- How many instances of scan we've matched
Min : Natural; -- Minimal number of scans to match
Max : Natural; -- Maximal number of scans to match
Greedy : Boolean; -- Whether to work our way up or down
Scan : Pointer; -- The thing to match
Next : Pointer; -- What has to match after it
Lastloc : Natural; -- Where we started matching this scan
Old_Cc : Current_Curly_Access; -- Before we started this one
end record;
-- Data used to handle the curly operator and the plus and star
-- operators for complex expressions.
Current_Curly : Current_Curly_Access := null;
-- The curly currently being processed.
-----------------------
-- Local Subprograms --
-----------------------
function Index (Start : Positive; C : Character) return Natural;
-- Find character C in Data starting at Start and return position
function Repeat
(IP : Pointer;
Max : Natural := Natural'Last) return Natural;
-- Repeatedly match something simple, report how many
-- It only matches on things of length 1.
-- Starting from Input_Pos, it matches at most Max CURLY.
function Try (Pos : in Positive) return Boolean;
-- Try to match at specific point
function Match (IP : Pointer) return Boolean;
-- This is the main matching routine. Conceptually the strategy
-- is simple: check to see whether the current node matches,
-- call self recursively to see whether the rest matches,
-- and then act accordingly.
--
-- In practice Match makes some effort to avoid recursion, in
-- particular by going through "ordinary" nodes (that don't
-- need to know whether the rest of the match failed) by
-- using a loop instead of recursion.
-- Why is the above comment part of the spec rather than body ???
function Match_Whilem (IP : Pointer) return Boolean;
-- Return True if a WHILEM matches
-- How come IP is unreferenced in the body ???
function Recurse_Match (IP : Pointer; From : Natural) return Boolean;
pragma Inline (Recurse_Match);
-- Calls Match recursively. It saves and restores the parenthesis
-- status and location in the input stream correctly, so that
-- backtracking is possible
function Match_Simple_Operator
(Op : Opcode;
Scan : Pointer;
Next : Pointer;
Greedy : Boolean) return Boolean;
-- Return True it the simple operator (possibly non-greedy) matches
pragma Inline (Index);
pragma Inline (Repeat);
-- These are two complex functions, but used only once.
pragma Inline (Match_Whilem);
pragma Inline (Match_Simple_Operator);
-----------
-- Index --
-----------
function Index (Start : Positive; C : Character) return Natural is
begin
for J in Start .. Last_In_Data loop
if Data (J) = C then
return J;
end if;
end loop;
return 0;
end Index;
-------------------
-- Recurse_Match --
-------------------
function Recurse_Match (IP : Pointer; From : Natural) return Boolean is
L : constant Natural := Last_Paren;
Tmp_F : constant Match_Array :=
Matches_Full (From + 1 .. Matches_Full'Last);
Start : constant Natural_Array :=
Matches_Tmp (From + 1 .. Matches_Tmp'Last);
Input : constant Natural := Input_Pos;
begin
if Match (IP) then
return True;
end if;
Last_Paren := L;
Matches_Full (Tmp_F'Range) := Tmp_F;
Matches_Tmp (Start'Range) := Start;
Input_Pos := Input;
return False;
end Recurse_Match;
-----------
-- Match --
-----------
function Match (IP : Pointer) return Boolean is
Scan : Pointer := IP;
Next : Pointer;
Op : Opcode;
begin
State_Machine :
loop
pragma Assert (Scan /= 0);
-- Determine current opcode and count its usage in debug mode
Op := Opcode'Val (Character'Pos (Program (Scan)));
-- Calculate offset of next instruction.
-- Second character is most significant in Program_Data.
Next := Get_Next (Program, Scan);
case Op is
when EOP =>
return True; -- Success !
when BRANCH =>
if Program (Next) /= BRANCH then
Next := Operand (Scan); -- No choice, avoid recursion
else
loop
if Recurse_Match (Operand (Scan), 0) then
return True;
end if;
Scan := Get_Next (Program, Scan);
exit when Scan = 0 or else Program (Scan) /= BRANCH;
end loop;
exit State_Machine;
end if;
when NOTHING =>
null;
when BOL =>
exit State_Machine when Input_Pos /= BOL_Pos
and then ((Self.Flags and Multiple_Lines) = 0
or else Data (Input_Pos - 1) /= ASCII.LF);
when MBOL =>
exit State_Machine when Input_Pos /= BOL_Pos
and then Data (Input_Pos - 1) /= ASCII.LF;
when SBOL =>
exit State_Machine when Input_Pos /= BOL_Pos;
when EOL =>
exit State_Machine when Input_Pos <= Data'Last
and then ((Self.Flags and Multiple_Lines) = 0
or else Data (Input_Pos) /= ASCII.LF);
when MEOL =>
exit State_Machine when Input_Pos <= Data'Last
and then Data (Input_Pos) /= ASCII.LF;
when SEOL =>
exit State_Machine when Input_Pos <= Data'Last;
when BOUND | NBOUND =>
-- Was last char in word ?
declare
N : Boolean := False;
Ln : Boolean := False;
begin
if Input_Pos /= First_In_Data then
N := Is_Alnum (Data (Input_Pos - 1));
end if;
if Input_Pos > Last_In_Data then
Ln := False;
else
Ln := Is_Alnum (Data (Input_Pos));
end if;
if Op = BOUND then
if N = Ln then
exit State_Machine;
end if;
else
if N /= Ln then
exit State_Machine;
end if;
end if;
end;
when SPACE =>
exit State_Machine when Input_Pos > Last_In_Data
or else not Is_White_Space (Data (Input_Pos));
Input_Pos := Input_Pos + 1;
when NSPACE =>
exit State_Machine when Input_Pos > Last_In_Data
or else Is_White_Space (Data (Input_Pos));
Input_Pos := Input_Pos + 1;
when DIGIT =>
exit State_Machine when Input_Pos > Last_In_Data
or else not Is_Digit (Data (Input_Pos));
Input_Pos := Input_Pos + 1;
when NDIGIT =>
exit State_Machine when Input_Pos > Last_In_Data
or else Is_Digit (Data (Input_Pos));
Input_Pos := Input_Pos + 1;
when ALNUM =>
exit State_Machine when Input_Pos > Last_In_Data
or else not Is_Alnum (Data (Input_Pos));
Input_Pos := Input_Pos + 1;
when NALNUM =>
exit State_Machine when Input_Pos > Last_In_Data
or else Is_Alnum (Data (Input_Pos));
Input_Pos := Input_Pos + 1;
when ANY =>
exit State_Machine when Input_Pos > Last_In_Data
or else Data (Input_Pos) = ASCII.LF;
Input_Pos := Input_Pos + 1;
when SANY =>
exit State_Machine when Input_Pos > Last_In_Data;
Input_Pos := Input_Pos + 1;
when EXACT =>
declare
Opnd : Pointer := String_Operand (Scan);
Current : Positive := Input_Pos;
Last : constant Pointer :=
Opnd + String_Length (Program, Scan);
begin
while Opnd <= Last loop
exit State_Machine when Current > Last_In_Data
or else Program (Opnd) /= Data (Current);
Current := Current + 1;
Opnd := Opnd + 1;
end loop;
Input_Pos := Current;
end;
when EXACTF =>
declare
Opnd : Pointer := String_Operand (Scan);
Current : Positive := Input_Pos;
Last : constant Pointer :=
Opnd + String_Length (Program, Scan);
begin
while Opnd <= Last loop
exit State_Machine when Current > Last_In_Data
or else Program (Opnd) /= To_Lower (Data (Current));
Current := Current + 1;
Opnd := Opnd + 1;
end loop;
Input_Pos := Current;
end;
when ANYOF =>
declare
Bitmap : Character_Class;
begin
Bitmap_Operand (Program, Scan, Bitmap);
exit State_Machine when Input_Pos > Last_In_Data
or else not Get_From_Class (Bitmap, Data (Input_Pos));
Input_Pos := Input_Pos + 1;
end;
when OPEN =>
declare
No : constant Natural :=
Character'Pos (Program (Operand (Scan)));
begin
Matches_Tmp (No) := Input_Pos;
end;
when CLOSE =>
declare
No : constant Natural :=
Character'Pos (Program (Operand (Scan)));
begin
Matches_Full (No) := (Matches_Tmp (No), Input_Pos - 1);
if Last_Paren < No then
Last_Paren := No;
end if;
end;
when REFF =>
declare
No : constant Natural :=
Character'Pos (Program (Operand (Scan)));
Data_Pos : Natural;
begin
-- If we haven't seen that parenthesis yet
if Last_Paren < No then
return False;
end if;
Data_Pos := Matches_Full (No).First;
while Data_Pos <= Matches_Full (No).Last loop
if Input_Pos > Last_In_Data
or else Data (Input_Pos) /= Data (Data_Pos)
then
return False;
end if;
Input_Pos := Input_Pos + 1;
Data_Pos := Data_Pos + 1;
end loop;
end;
when MINMOD =>
Greedy := False;
when STAR | PLUS | CURLY =>
declare
Greed : constant Boolean := Greedy;
begin
Greedy := True;
return Match_Simple_Operator (Op, Scan, Next, Greed);
end;
when CURLYX =>
-- Looking at something like:
-- 1: CURLYX {n,m} (->4)
-- 2: code for complex thing (->3)
-- 3: WHILEM (->0)
-- 4: NOTHING
declare
Min : constant Natural :=
Read_Natural (Program, Scan + 3);
Max : constant Natural :=
Read_Natural (Program, Scan + 5);
Cc : aliased Current_Curly_Record;
Has_Match : Boolean;
begin
Cc := (Paren_Floor => Last_Paren,
Cur => -1,
Min => Min,
Max => Max,
Greedy => Greedy,
Scan => Scan + 7,
Next => Next,
Lastloc => 0,
Old_Cc => Current_Curly);
Current_Curly := Cc'Unchecked_Access;
Has_Match := Match (Next - 3);
-- Start on the WHILEM
Current_Curly := Cc.Old_Cc;
return Has_Match;
end;
when WHILEM =>
return Match_Whilem (IP);
end case;
Scan := Next;
end loop State_Machine;
-- If we get here, there is no match.
-- For successful matches when EOP is the terminating point.
return False;
end Match;
---------------------------
-- Match_Simple_Operator --
---------------------------
function Match_Simple_Operator
(Op : Opcode;
Scan : Pointer;
Next : Pointer;
Greedy : Boolean) return Boolean
is
Next_Char : Character := ASCII.Nul;
Next_Char_Known : Boolean := False;
No : Integer; -- Can be negative
Min : Natural;
Max : Natural := Natural'Last;
Operand_Code : Pointer;
Old : Natural;
Last_Pos : Natural;
Save : constant Natural := Input_Pos;
begin
-- Lookahead to avoid useless match attempts
-- when we know what character comes next.
if Program (Next) = EXACT then
Next_Char := Program (String_Operand (Next));
Next_Char_Known := True;
end if;
-- Find the minimal and maximal values for the operator
case Op is
when STAR =>
Min := 0;
Operand_Code := Operand (Scan);
when PLUS =>
Min := 1;
Operand_Code := Operand (Scan);
when others =>
Min := Read_Natural (Program, Scan + 3);
Max := Read_Natural (Program, Scan + 5);
Operand_Code := Scan + 7;
end case;
-- Non greedy operators
if not Greedy then
-- Test the minimal repetitions
if Min /= 0
and then Repeat (Operand_Code, Min) < Min
then
return False;
end if;
Old := Input_Pos;
-- Find the place where 'next' could work
if Next_Char_Known then
-- Last position to check
Last_Pos := Input_Pos + Max;
if Last_Pos > Last_In_Data
or else Max = Natural'Last
then
Last_Pos := Last_In_Data;
end if;
-- Look for the first possible opportunity
loop
-- Find the next possible position
while Input_Pos <= Last_Pos
and then Data (Input_Pos) /= Next_Char
loop
Input_Pos := Input_Pos + 1;
end loop;
if Input_Pos > Last_Pos then
return False;
end if;
-- Check that we still match if we stop
-- at the position we just found.
declare
Num : constant Natural := Input_Pos - Old;
begin
Input_Pos := Old;
if Repeat (Operand_Code, Num) < Num then
return False;
end if;
end;
-- Input_Pos now points to the new position
if Match (Get_Next (Program, Scan)) then
return True;
end if;
Old := Input_Pos;
Input_Pos := Input_Pos + 1;
end loop;
-- We know what the next character is
else
while Max >= Min loop
-- If the next character matches
if Match (Next) then
return True;
end if;
Input_Pos := Save + Min;
-- Could not or did not match -- move forward
if Repeat (Operand_Code, 1) /= 0 then
Min := Min + 1;
else
return False;
end if;
end loop;
end if;
return False;
-- Greedy operators
else
No := Repeat (Operand_Code, Max);
-- ??? Perl has some special code here in case the
-- next instruction is of type EOL, since $ and \Z
-- can match before *and* after newline at the end.
-- ??? Perl has some special code here in case (paren)
-- is True.
-- Else, if we don't have any parenthesis
while No >= Min loop
if not Next_Char_Known
or else (Input_Pos <= Last_In_Data
and then Data (Input_Pos) = Next_Char)
then
if Match (Next) then
return True;
end if;
end if;
-- Could not or did not work, we back up
No := No - 1;
Input_Pos := Save + No;
end loop;
return False;
end if;
end Match_Simple_Operator;
------------------
-- Match_Whilem --
------------------
-- This is really hard to understand, because after we match what we
-- are trying to match, we must make sure the rest of the REx is going
-- to match for sure, and to do that we have to go back UP the parse
-- tree by recursing ever deeper. And if it fails, we have to reset
-- our parent's current state that we can try again after backing off.
function Match_Whilem (IP : Pointer) return Boolean is
pragma Unreferenced (IP);
Cc : constant Current_Curly_Access := Current_Curly;
N : constant Natural := Cc.Cur + 1;
Ln : Natural := 0;
Lastloc : constant Natural := Cc.Lastloc;
-- Detection of 0-len.
begin
-- If degenerate scan matches "", assume scan done.
if Input_Pos = Cc.Lastloc
and then N >= Cc.Min
then
-- Temporarily restore the old context, and check that we
-- match was comes after CURLYX.
Current_Curly := Cc.Old_Cc;
if Current_Curly /= null then
Ln := Current_Curly.Cur;
end if;
if Match (Cc.Next) then
return True;
end if;
if Current_Curly /= null then
Current_Curly.Cur := Ln;
end if;
Current_Curly := Cc;
return False;
end if;
-- First, just match a string of min scans.
if N < Cc.Min then
Cc.Cur := N;
Cc.Lastloc := Input_Pos;
if Match (Cc.Scan) then
return True;
end if;
Cc.Cur := N - 1;
Cc.Lastloc := Lastloc;
return False;
end if;
-- Prefer next over scan for minimal matching.
if not Cc.Greedy then
Current_Curly := Cc.Old_Cc;
if Current_Curly /= null then
Ln := Current_Curly.Cur;
end if;
if Recurse_Match (Cc.Next, Cc.Paren_Floor) then
return True;
end if;
if Current_Curly /= null then
Current_Curly.Cur := Ln;
end if;
Current_Curly := Cc;
-- Maximum greed exceeded ?
if N >= Cc.Max then
return False;
end if;
-- Try scanning more and see if it helps
Cc.Cur := N;
Cc.Lastloc := Input_Pos;
if Recurse_Match (Cc.Scan, Cc.Paren_Floor) then
return True;
end if;
Cc.Cur := N - 1;
Cc.Lastloc := Lastloc;
return False;
end if;
-- Prefer scan over next for maximal matching
if N < Cc.Max then -- more greed allowed ?
Cc.Cur := N;
Cc.Lastloc := Input_Pos;
if Recurse_Match (Cc.Scan, Cc.Paren_Floor) then
return True;
end if;
end if;
-- Failed deeper matches of scan, so see if this one works
Current_Curly := Cc.Old_Cc;
if Current_Curly /= null then
Ln := Current_Curly.Cur;
end if;
if Match (Cc.Next) then
return True;
end if;
if Current_Curly /= null then
Current_Curly.Cur := Ln;
end if;
Current_Curly := Cc;
Cc.Cur := N - 1;
Cc.Lastloc := Lastloc;
return False;
end Match_Whilem;
------------
-- Repeat --
------------
function Repeat
(IP : Pointer;
Max : Natural := Natural'Last) return Natural
is
Scan : Natural := Input_Pos;
Last : Natural;
Op : constant Opcode := Opcode'Val (Character'Pos (Program (IP)));
Count : Natural;
C : Character;
Is_First : Boolean := True;
Bitmap : Character_Class;
begin
if Max = Natural'Last or else Scan + Max - 1 > Last_In_Data then
Last := Last_In_Data;
else
Last := Scan + Max - 1;
end if;
case Op is
when ANY =>
while Scan <= Last
and then Data (Scan) /= ASCII.LF
loop
Scan := Scan + 1;
end loop;
when SANY =>
Scan := Last + 1;
when EXACT =>
-- The string has only one character if Repeat was called
C := Program (String_Operand (IP));
while Scan <= Last
and then C = Data (Scan)
loop
Scan := Scan + 1;
end loop;
when EXACTF =>
-- The string has only one character if Repeat was called
C := Program (String_Operand (IP));
while Scan <= Last
and then To_Lower (C) = Data (Scan)
loop
Scan := Scan + 1;
end loop;
when ANYOF =>
if Is_First then
Bitmap_Operand (Program, IP, Bitmap);
Is_First := False;
end if;
while Scan <= Last
and then Get_From_Class (Bitmap, Data (Scan))
loop
Scan := Scan + 1;
end loop;
when ALNUM =>
while Scan <= Last
and then Is_Alnum (Data (Scan))
loop
Scan := Scan + 1;
end loop;
when NALNUM =>
while Scan <= Last
and then not Is_Alnum (Data (Scan))
loop
Scan := Scan + 1;
end loop;
when SPACE =>
while Scan <= Last
and then Is_White_Space (Data (Scan))
loop
Scan := Scan + 1;
end loop;
when NSPACE =>
while Scan <= Last
and then not Is_White_Space (Data (Scan))
loop
Scan := Scan + 1;
end loop;
when DIGIT =>
while Scan <= Last
and then Is_Digit (Data (Scan))
loop
Scan := Scan + 1;
end loop;
when NDIGIT =>
while Scan <= Last
and then not Is_Digit (Data (Scan))
loop
Scan := Scan + 1;
end loop;
when others =>
raise Program_Error;
end case;
Count := Scan - Input_Pos;
Input_Pos := Scan;
return Count;
end Repeat;
---------
-- Try --
---------
function Try (Pos : in Positive) return Boolean is
begin
Input_Pos := Pos;
Last_Paren := 0;
Matches_Full := (others => No_Match);
if Match (Program_First + 1) then
Matches_Full (0) := (Pos, Input_Pos - 1);
return True;
end if;
return False;
end Try;
-- Start of processing for Match
begin
-- Do we have the regexp Never_Match?
if Self.Size = 0 then
Matches (0) := No_Match;
return;
end if;
-- Check validity of program
pragma Assert
(Program (Program_First) = MAGIC,
"Corrupted Pattern_Matcher");
-- If there is a "must appear" string, look for it
if Self.Must_Have_Length > 0 then
declare
First : constant Character := Program (Self.Must_Have);
Must_First : constant Pointer := Self.Must_Have;
Must_Last : constant Pointer :=
Must_First + Pointer (Self.Must_Have_Length - 1);
Next_Try : Natural := Index (First_In_Data, First);
begin
while Next_Try /= 0
and then Data (Next_Try .. Next_Try + Self.Must_Have_Length - 1)
= String (Program (Must_First .. Must_Last))
loop
Next_Try := Index (Next_Try + 1, First);
end loop;
if Next_Try = 0 then
Matches_Full := (others => No_Match);
return; -- Not present
end if;
end;
end if;
-- Mark beginning of line for ^
BOL_Pos := Data'First;
-- Simplest case first: an anchored match need be tried only once
if Self.Anchored and then (Self.Flags and Multiple_Lines) = 0 then
Matched := Try (First_In_Data);
elsif Self.Anchored then
declare
Next_Try : Natural := First_In_Data;
begin
-- Test the first position in the buffer
Matched := Try (Next_Try);
-- Else only test after newlines
if not Matched then
while Next_Try <= Last_In_Data loop
while Next_Try <= Last_In_Data
and then Data (Next_Try) /= ASCII.LF
loop
Next_Try := Next_Try + 1;
end loop;
Next_Try := Next_Try + 1;
if Next_Try <= Last_In_Data then
Matched := Try (Next_Try);
exit when Matched;
end if;
end loop;
end if;
end;
elsif Self.First /= ASCII.NUL then
-- We know what char it must start with
declare
Next_Try : Natural := Index (First_In_Data, Self.First);
begin
while Next_Try /= 0 loop
Matched := Try (Next_Try);
exit when Matched;
Next_Try := Index (Next_Try + 1, Self.First);
end loop;
end;
else
-- Messy cases: try all locations (including for the empty string)
Matched := Try (First_In_Data);
if not Matched then
for S in First_In_Data + 1 .. Last_In_Data loop
Matched := Try (S);
exit when Matched;
end loop;
end if;
end if;
-- Matched has its value
for J in Last_Paren + 1 .. Matches'Last loop
Matches_Full (J) := No_Match;
end loop;
Matches := Matches_Full (Matches'Range);
return;
end Match;
-----------
-- Match --
-----------
function Match
(Self : Pattern_Matcher;
Data : String;
Data_First : Integer := -1;
Data_Last : Positive := Positive'Last) return Natural
is
Matches : Match_Array (0 .. 0);
begin
Match (Self, Data, Matches, Data_First, Data_Last);
if Matches (0) = No_Match then
return Data'First - 1;
else
return Matches (0).First;
end if;
end Match;
function Match
(Self : Pattern_Matcher;
Data : String;
Data_First : Integer := -1;
Data_Last : Positive := Positive'Last) return Boolean
is
Matches : Match_Array (0 .. 0);
begin
Match (Self, Data, Matches, Data_First, Data_Last);
return Matches (0).First >= Data'First;
end Match;
procedure Match
(Expression : String;
Data : String;
Matches : out Match_Array;
Size : Program_Size := Auto_Size;
Data_First : Integer := -1;
Data_Last : Positive := Positive'Last)
is
PM : Pattern_Matcher (Size);
Finalize_Size : Program_Size;
begin
if Size = 0 then
Match (Compile (Expression), Data, Matches, Data_First, Data_Last);
else
Compile (PM, Expression, Finalize_Size);
Match (PM, Data, Matches, Data_First, Data_Last);
end if;
end Match;
-----------
-- Match --
-----------
function Match
(Expression : String;
Data : String;
Size : Program_Size := Auto_Size;
Data_First : Integer := -1;
Data_Last : Positive := Positive'Last) return Natural
is
PM : Pattern_Matcher (Size);
Final_Size : Program_Size; -- unused
begin
if Size = 0 then
return Match (Compile (Expression), Data, Data_First, Data_Last);
else
Compile (PM, Expression, Final_Size);
return Match (PM, Data, Data_First, Data_Last);
end if;
end Match;
-----------
-- Match --
-----------
function Match
(Expression : String;
Data : String;
Size : Program_Size := Auto_Size;
Data_First : Integer := -1;
Data_Last : Positive := Positive'Last) return Boolean
is
Matches : Match_Array (0 .. 0);
PM : Pattern_Matcher (Size);
Final_Size : Program_Size; -- unused
begin
if Size = 0 then
Match (Compile (Expression), Data, Matches, Data_First, Data_Last);
else
Compile (PM, Expression, Final_Size);
Match (PM, Data, Matches, Data_First, Data_Last);
end if;
return Matches (0).First >= Data'First;
end Match;
-------------
-- Operand --
-------------
function Operand (P : Pointer) return Pointer is
begin
return P + 3;
end Operand;
--------------
-- Optimize --
--------------
procedure Optimize (Self : in out Pattern_Matcher) is
Max_Length : Program_Size;
This_Length : Program_Size;
Longest : Pointer;
Scan : Pointer;
Program : Program_Data renames Self.Program;
begin
-- Start with safe defaults (no optimization):
-- * No known first character of match
-- * Does not necessarily start at beginning of line
-- * No string known that has to appear in data
Self.First := ASCII.NUL;
Self.Anchored := False;
Self.Must_Have := Program'Last + 1;
Self.Must_Have_Length := 0;
Scan := Program_First + 1; -- First instruction (can be anything)
if Program (Scan) = EXACT then
Self.First := Program (String_Operand (Scan));
elsif Program (Scan) = BOL
or else Program (Scan) = SBOL
or else Program (Scan) = MBOL
then
Self.Anchored := True;
end if;
-- If there's something expensive in the regexp, find the
-- longest literal string that must appear and make it the
-- regmust. Resolve ties in favor of later strings, since
-- the regstart check works with the beginning of the regexp.
-- and avoiding duplication strengthens checking. Not a
-- strong reason, but sufficient in the absence of others.
if False then -- if Flags.SP_Start then ???
Longest := 0;
Max_Length := 0;
while Scan /= 0 loop
if Program (Scan) = EXACT or else Program (Scan) = EXACTF then
This_Length := String_Length (Program, Scan);
if This_Length >= Max_Length then
Longest := String_Operand (Scan);
Max_Length := This_Length;
end if;
end if;
Scan := Get_Next (Program, Scan);
end loop;
Self.Must_Have := Longest;
Self.Must_Have_Length := Natural (Max_Length) + 1;
end if;
end Optimize;
-----------------
-- Paren_Count --
-----------------
function Paren_Count (Regexp : Pattern_Matcher) return Match_Count is
begin
return Regexp.Paren_Count;
end Paren_Count;
-----------
-- Quote --
-----------
function Quote (Str : String) return String is
S : String (1 .. Str'Length * 2);
Last : Natural := 0;
begin
for J in Str'Range loop
case Str (J) is
when '^' | '$' | '|' | '*' | '+' | '?' | '{' |
'}' | '[' | ']' | '(' | ')' | '\' =>
S (Last + 1) := '\';
S (Last + 2) := Str (J);
Last := Last + 2;
when others =>
S (Last + 1) := Str (J);
Last := Last + 1;
end case;
end loop;
return S (1 .. Last);
end Quote;
------------------
-- Read_Natural --
------------------
function Read_Natural
(Program : Program_Data;
IP : Pointer) return Natural
is
begin
return Character'Pos (Program (IP)) +
256 * Character'Pos (Program (IP + 1));
end Read_Natural;
-----------------
-- Reset_Class --
-----------------
procedure Reset_Class (Bitmap : out Character_Class) is
begin
Bitmap := (others => 0);
end Reset_Class;
------------------
-- Set_In_Class --
------------------
procedure Set_In_Class
(Bitmap : in out Character_Class;
C : Character)
is
Value : constant Class_Byte := Character'Pos (C);
begin
Bitmap (Value / 8) := Bitmap (Value / 8)
or Bit_Conversion (Value mod 8);
end Set_In_Class;
-------------------
-- String_Length --
-------------------
function String_Length
(Program : Program_Data;
P : Pointer) return Program_Size
is
begin
pragma Assert (Program (P) = EXACT or else Program (P) = EXACTF);
return Character'Pos (Program (P + 3));
end String_Length;
--------------------
-- String_Operand --
--------------------
function String_Operand (P : Pointer) return Pointer is
begin
return P + 4;
end String_Operand;
end GNAT.Regpat;