-- ----------------------------------------------------------------------------
--
-- Copyright (C) 2009, 2010 Stephen Leake
-- Copyright (C) 1999, 2000 Christoph Karl Walter Grein
--
-- This file is part of the OpenToken package.
--
-- The OpenToken package is free software; you can redistribute it and/or
-- modify it under the terms of the GNU General Public License as published
-- by the Free Software Foundation; either version 3, or (at your option)
-- any later version. The OpenToken package is distributed in the hope that
-- it will be useful, but WITHOUT ANY WARRANTY; without even the implied
-- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-- GNU General Public License for more details. You should have received
-- a copy of the GNU General Public License distributed with the OpenToken
-- package; see file GPL.txt. If not, write to the Free Software Foundation,
-- 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
--
-- As a special exception, if other files instantiate generics from
-- this unit, or you link this unit with other files to produce an
-- executable, this unit does not by itself cause the resulting
-- executable to be covered by the GNU General Public License. This
-- exception does not however invalidate any other reasons why the
-- executable file might be covered by the GNU Public License.
-- ----------------------------------------------------------------------------

with Ada.Characters.Latin_1;
with Ada.Strings.Maps.Constants;
with OpenToken.Recognizer.Bracketed_Comment;
with OpenToken.Recognizer.Character_Set;
with OpenToken.Recognizer.End_Of_File;
with OpenToken.Recognizer.HTML_Entity;
with OpenToken.Recognizer.Keyword;
with OpenToken.Recognizer.Nothing;
with OpenToken.Recognizer.Separator;
with OpenToken.Recognizer.String;
package body HTML_Lexer is

   --  NOTE(review): the string literals that contain HTML markup
   --  ("<!DOCTYPE", "</", "<!--", "-->", "<PRE>", "</PRE>") were
   --  reconstructed after being lost from this file; confirm them
   --  (in particular the case of the PRE delimiters) against the
   --  upstream OpenToken sources.

   --  Return the Name component of Token.
   function Name (Token : HTML_Token) return Token_Name is
   begin
      return Token.Name;
   end Name;

   --  Return the Lexeme component of Token, converted from an
   --  unbounded string to a plain String.
   function Lexeme (Token : HTML_Token) return Standard.String is
   begin
      return Ada.Strings.Unbounded.To_String (Token.Lexeme);
   end Lexeme;

   --  Return the Line component of Token.
   function Line (Token : in HTML_Token) return Natural is
   begin
      return Token.Line;
   end Line;

   --  Return the Column component of Token.
   function Column (Token : in HTML_Token) return Natural is
   begin
      return Token.Column;
   end Column;

   --  Makes the "-" and "or" operators on character sets directly
   --  visible; both syntaxes below use them.
   use type Ada.Strings.Maps.Character_Set;

   --  Characters recognized as whitespace by both syntaxes:
   --  horizontal tab, carriage return, line feed and space.
   HTML_Whitespace : constant Ada.Strings.Maps.Character_Set :=
     Ada.Strings.Maps.To_Set
       (Ada.Characters.Latin_1.HT &
        Ada.Characters.Latin_1.CR &
        Ada.Characters.Latin_1.LF &
        Ada.Characters.Latin_1.Space);

   --  Build the syntax whose recognizers cover document text: the
   --  document type declaration, tag openers, plain text, entities,
   --  comments, whitespace, end of file and preformatted sections.
   --  Every token not named explicitly gets the recognizer returned
   --  by OpenToken.Recognizer.Nothing.Get.
   function Text_Syntax return Tokenizer.Syntax is
   begin
      return
        (Document_Type    => Tokenizer.Get
           (OpenToken.Recognizer.Bracketed_Comment.Get
              (Comment_Opener => "<!DOCTYPE",
               Comment_Closer => ">",
               Reportable     => True)),

         Start_Tag_Opener => Tokenizer.Get
           (OpenToken.Recognizer.Separator.Get ("<")),

         End_Tag_Opener   => Tokenizer.Get
           (OpenToken.Recognizer.Separator.Get ("</")),

         --  Text is any run of graphic characters other than '<'
         --  and '&'.
         Text             => Tokenizer.Get
           (OpenToken.Recognizer.Character_Set.Get
              (Ada.Strings.Maps.Constants.Graphic_Set -
                 Ada.Strings.Maps.To_Set ("<&"),
               Reportable => True)),

         Entity           => Tokenizer.Get
           (OpenToken.Recognizer.HTML_Entity.Get),

         --  See HTML definition section 3.2.4 Comments; HTML syntax
         --  actually allows whitespace in the comment closer: "-- >"
         --  is a closer. That also means that "<!-- -- -->" is
         --  invalid syntax. But we don't have a recognizer that can
         --  deal with that, and this is good enough for common usage.
         Comment          => Tokenizer.Get
           (OpenToken.Recognizer.Bracketed_Comment.Get
              (Comment_Opener => "<!--",
               Comment_Closer => "-->",
               Reportable     => True)),

         Whitespace       => Tokenizer.Get
           (OpenToken.Recognizer.Character_Set.Get (HTML_Whitespace)),

         Bad_Token        => Tokenizer.Get
           (OpenToken.Recognizer.Nothing.Get),

         End_Of_File      => Tokenizer.Get
           (OpenToken.Recognizer.End_Of_File.Get),

         --  A preformatted section is recognized as one bracketed
         --  token, from its opening tag through its closing tag.
         Pre              => Tokenizer.Get
           (OpenToken.Recognizer.Bracketed_Comment.Get
              (Comment_Opener => "<PRE>",
               Comment_Closer => "</PRE>",
               Reportable     => True)),

         others           => Tokenizer.Get
           (OpenToken.Recognizer.Nothing.Get));
   end Text_Syntax;

   --  Build the syntax whose recognizers cover the inside of a tag:
   --  the tag closer, element and attribute keywords, "=", unquoted
   --  values, quoted strings, whitespace and end of file. Every
   --  token not named explicitly gets the recognizer returned by
   --  OpenToken.Recognizer.Nothing.Get.
   function Tag_Syntax return Tokenizer.Syntax is
   begin
      return
        (Tag_Closer      => Tokenizer.Get
           (OpenToken.Recognizer.Separator.Get (">")),

         --  Element names
         HTML            => Tokenizer.Get
           (OpenToken.Recognizer.Keyword.Get ("HTML")),
         Head            => Tokenizer.Get
           (OpenToken.Recognizer.Keyword.Get ("Head")),
         Meta            => Tokenizer.Get
           (OpenToken.Recognizer.Keyword.Get ("Meta")),
         HTML_Body       => Tokenizer.Get
           (OpenToken.Recognizer.Keyword.Get ("Body")),
         Heading_1       => Tokenizer.Get
           (OpenToken.Recognizer.Keyword.Get ("H1")),
         Anchor          => Tokenizer.Get
           (OpenToken.Recognizer.Keyword.Get ("A")),
         Image           => Tokenizer.Get
           (OpenToken.Recognizer.Keyword.Get ("IMG")),

         --  Attribute names
         Content         => Tokenizer.Get
           (OpenToken.Recognizer.Keyword.Get ("CONTENT")),
         Hyper_Reference => Tokenizer.Get
           (OpenToken.Recognizer.Keyword.Get ("HREF")),
         Link_Type       => Tokenizer.Get
           (OpenToken.Recognizer.Keyword.Get ("TYPE")),
         Name            => Tokenizer.Get
           (OpenToken.Recognizer.Keyword.Get ("NAME")),
         Source          => Tokenizer.Get
           (OpenToken.Recognizer.Keyword.Get ("SRC")),
         Title           => Tokenizer.Get
           (OpenToken.Recognizer.Keyword.Get ("TITLE")),

         Assignment      => Tokenizer.Get
           (OpenToken.Recognizer.Separator.Get ("=")),

         --  An unquoted value is a run of letters, decimal digits
         --  and the listed punctuation characters.
         Value           => Tokenizer.Get
           (OpenToken.Recognizer.Character_Set.Get
              (Ada.Strings.Maps.Constants.Letter_Set or
                 Ada.Strings.Maps.Constants.Decimal_Digit_Set or
                 Ada.Strings.Maps.To_Set (".-_/:#?',*%"),
               --  unquoted URIs, framesets
               Reportable => True)),

         String          => Tokenizer.Get
           (OpenToken.Recognizer.String.Get (Double_Delimiter => False)),

         Whitespace      => Tokenizer.Get
           (OpenToken.Recognizer.Character_Set.Get (HTML_Whitespace)),

         Bad_Token       => Tokenizer.Get
           (OpenToken.Recognizer.Nothing.Get),

         End_Of_File     => Tokenizer.Get
           (OpenToken.Recognizer.End_Of_File.Get),

         others          => Tokenizer.Get
           (OpenToken.Recognizer.Nothing.Get));
   end Tag_Syntax;

end HTML_Lexer;