--  part of ParserTools, (c) 2017 Felix Krause
--  released under the terms of the MIT license, see the file "copying.txt"

with Ada.Containers;
with Ada.Strings.UTF_Encoding;
with Ada.Strings.Unbounded;
with System.Storage_Elements;

package Text is
   --  this package defines a reference-counted string pointer type. it is used
   --  for all YAML data entities and relieves the user from the need to
   --  manually dispose events created by the parser.
   --
   --  typically, YAML content strings are deallocated in the same order as they
   --  are allocated. this knowledge is built into a storage pool for efficient
   --  memory usage and to avoid fragmentation.
   --
   --  to be able to efficiently interface with C, this package allocates its
   --  strings so that they can directly be passed on to C without the need to
   --  copy any data. Use the subroutines Export and Delete_Exported to get
   --  C-compatible string values from a Reference. these subroutines also
   --  take care of reference counting for values exposed to C. this means that
   --  after exporting a value, you *must* eventually call Delete_Exported in
   --  order for the value to be freed.
   --
   --  HINT: this package makes use of compiler implementation details and may
   --  not work with other compilers. however, since there currently are no
   --  Ada 2012 compilers but GNAT, this is not considered a problem.

   --  the pool allocates the memory it uses on the heap. it is allowed for the
   --  pool to vanish while References created on it are still around. the
   --  heap memory is reclaimed when the pool itself and all References
   --  created by it vanish.
   --
   --  this type has pointer semantics in order to allow the usage of the same
   --  pool at different places without the need of access types. copying a
   --  value of this type will make both values use the same memory. use Create
   --  to generate a new independent pool.

   --  all strings generated by Yaml are encoded in UTF-8, regardless of input
   --  encoding.
   subtype UTF_8_String is Ada.Strings.UTF_Encoding.UTF_8_String;
   type UTF_8_String_Access is access UTF_8_String;

   subtype Pool_Offset is System.Storage_Elements.Storage_Offset
   range 0 .. System.Storage_Elements.Storage_Offset (Integer'Last);

   --  this is a smart pointer. use Value to access its value.
   subtype Reference is Ada.Strings.Unbounded.Unbounded_String;

   --  shortcut for Object.Value.Data'Length
   function Length (Object : Reference) return Natural renames Ada.Strings.Unbounded.Length;

   function "&" (Left, Right : Reference) return String;

   --  compares the string content of two Content values.
   function "=" (Left, Right : Reference) return Boolean renames Ada.Strings.Unbounded."=";

   function Hash (Object : Reference) return Ada.Containers.Hash_Type;

   --  equivalent to the empty string. default value for References.
   Empty : constant Reference;

   --  this can be used for constant Reference values that are declared at
   --  library level where no Pool is available. References pointing to a
   --  Constant_Content_Holder are never freed.
   subtype Constant_Instance is Reference;

   --  note that there is a limit of 128 characters for Content values created
   --  like this.
   function Hold (Content : String) return Constant_Instance
     renames Ada.Strings.Unbounded.To_Unbounded_String;

   --  get a Reference value which is a reference to the string contained in the
   --  Holder.
   function Held (Holder : Constant_Instance) return Reference is (Holder);

private
   --  this forces GNAT to store the First and Last dope values right before
   --  the first element of the String. we use that to our advantage.
   for UTF_8_String_Access'Size use Standard'Address_Size;

   type Chunk_Index_Type is range 1 .. 10;
   subtype Refcount_Type is Integer range 0 .. 2 ** 24 - 1;

   --  the pool consists of multiple chunks of memory. strings are allocated
   --  inside the chunks.
   type Pool_Array is array (Pool_Offset range <>) of System.Storage_Elements.Storage_Element;
   type Chunk is access Pool_Array;
   type Chunk_Array is array (Chunk_Index_Type) of Chunk;
   type Usage_Array is array (Chunk_Index_Type) of Natural;

   --  the idea is that Cur is the pointer to the active Chunk. all new strings
   --  are allocated in that active Chunk until there is no more space. then,
   --  we allocate a new Chunk of twice the size of the current one and make
   --  that the current Chunk. the old Chunk lives on until all Content strings
   --  allocated on it vanish. Usage is the number of strings currently
   --  allocated on the Chunk and is used as reference count. the current Chunk
   --  has Usage + 1 which prevents its deallocation even if the last Content
   --  string on it vanishes. the Content type's finalization takes care of
   --  decrementing the Usage value that counts allocated strings, while the
   --  String_Pool type's deallocation takes care of removing the +1 for the
   --  current Chunk.
   --
   --  we treat each Chunk basically as a bitvector ring list, and Pos is the
   --  current offset in the current Chunk. instead of having a full bitvector
   --  for allocating, we use the dope values from the strings that stay in the
   --  memory after deallocation. besides the First and Last values, we also
   --  store a reference count in the dope. so when searching for a place to
   --  allocate a new string, we can skip over regions that have a non-zero
   --  reference count in their header, and those with a 0 reference count are
   --  available space. compared to a real bitvector, we always have the
   --  information of the length of an free region available. we can avoid
   --  fragmentation by merging a region that is freed with the surrounding free
   --  regions.
   type Pool_Data is record
      Refcount : Refcount_Type := 1;
      Chunks : Chunk_Array;
      Usage  : Usage_Array := (1 => 1, others => 0);
      Cur    : Chunk_Index_Type := 1;
      Pos    : Pool_Offset;
   end record;

   type Pool_Data_Access is access Pool_Data;
   for Pool_Data_Access'Size use Standard'Address_Size;

   --  this is the dope vector of each string allocated in a Chunk. it is put
   --  immediately before the string's value. note that the First and Last
   --  elements are at the exact positions where GNAT searches for the string's
   --  boundary dope. this allows us to access those values for maintaining the
   --  ring list.
   type Header is record
      Pool : Pool_Data_Access;
      Chunk_Index : Chunk_Index_Type;
      Refcount : Refcount_Type := 1;
      First, Last : Pool_Offset;
   end record;

   Chunk_Index_Start : constant := Standard'Address_Size;
   Refcount_Start    : constant := Standard'Address_Size + 8;
   First_Start       : constant := Standard'Address_Size + 32;
   Last_Start        : constant := First_Start + Integer'Size;
   Header_End        : constant := Last_Start + Integer'Size - 1;

   for Header use record
      Pool        at 0 range 0 .. Chunk_Index_Start - 1;
      Chunk_Index at 0 range Chunk_Index_Start .. Refcount_Start - 1;
      Refcount    at 0 range Refcount_Start .. First_Start - 1;
      First       at 0 range First_Start .. Last_Start - 1;
      Last        at 0 range Last_Start .. Header_End;
   end record;
   for Header'Size use Header_End + 1;

   use type System.Storage_Elements.Storage_Offset;

   Header_Size : constant Pool_Offset := Header'Size / System.Storage_Unit;

   Chunk_Start_Index : constant := Chunk_Index_Start / System.Storage_Unit + 1;
   Refcount_Start_Index : constant := Refcount_Start / System.Storage_Unit + 1;
   First_Start_Index : constant := First_Start / System.Storage_Unit + 1;
   Last_Start_Index  : constant := Last_Start / System.Storage_Unit + 1;
   End_Index         : constant := (Header_End + 1) / System.Storage_Unit;

   type Accessor (Data : not null access constant UTF_8_String) is limited
      record
         --  holds a copy of the smart pointer, so the string cannot be freed
         --  while the accessor lives.
         Hold : Reference;
      end record;

   Empty : constant Reference := Ada.Strings.Unbounded.Null_Unbounded_String;

   --  it is important that all allocated strings are aligned to the header
   --  length. else, it may happen that we generate a region of free memory that
   --  is not large enough to hold a header – but we need to write the header
   --  there to hold information for the ring list. therefore, whenever we
   --  calculate offsets, we use this to round them up to a multiple of
   --  Header_Size.
   function Round_To_Header_Size (Length : Pool_Offset)
                                  return Pool_Offset is
     ((Length + Header_Size - 1) / Header_Size * Header_Size);

end Text;