note
	description: "UTF-8 encoding routines"
	library: "Gobo Eiffel Kernel Library"
	copyright: "Copyright (c) 2001-2018, Eric Bezault and others"
	license: "MIT License"
	date: "$Date: 2019-02-07 22:54:15 +0000 (Thu, 07 Feb 2019) $"
	revision: "$Revision: 102807 $"

class
	UC_UTF8_ROUTINES

create
	default_create

feature {NONE} -- Initialization

	default_create
			-- Process instances of classes with no creation clause.
			-- (Default: do nothing.)
			-- (from ANY)
		do
		end

feature -- Access

	Any_: KL_ANY_ROUTINES
			-- Routines that ought to be in class ANY
			-- (from KL_IMPORTED_ANY_ROUTINES)
		once
			create Result
		ensure -- from KL_IMPORTED_ANY_ROUTINES
			instance_free: class
			any_routines_not_void: Result /= Void
		end

	encoded_first_value (a_byte: CHARACTER_8): INTEGER_32
			-- Value encoded in first byte
			-- (the code of `a_byte' with its length-marker high bits removed)
		require
			is_encoded_first_byte: is_encoded_first_byte (a_byte)
		do
			Result := a_byte.code
			if a_byte <= Byte_127 then
					-- Single-byte character: the code is the value itself.
			elseif a_byte <= Byte_223 then
					-- Lead byte of a 2-byte sequence: keep the low 5 bits.
				Result := Result \\ 32
			elseif a_byte <= Byte_239 then
					-- Lead byte of a 3-byte sequence: keep the low 4 bits.
				Result := Result \\ 16
			elseif a_byte <= Byte_244 then
					-- Lead byte of a 4-byte sequence: keep the low 3 bits.
				Result := Result \\ 8
			end
		ensure
			instance_free: class
			value_positive: Result >= 0
			value_small_enough: Result < 128
		end

	encoded_next_value (a_byte: CHARACTER_8): INTEGER_32
			-- Value encoded in one of the next bytes
			-- (the low 6 bits of the code of `a_byte')
		require
			is_encoded_next_byte: is_encoded_next_byte (a_byte)
		do
			Result := a_byte.code \\ 64
		ensure
			instance_free: class
			value_positive: Result >= 0
			value_small_enough: Result < 64
		end

	generating_type: TYPE [detachable UC_UTF8_ROUTINES]
			-- Type of current object
			-- (type of which it is a direct instance)
			-- (from ANY)
		external
			"built_in"
		ensure -- from ANY
			generating_type_not_void: Result /= Void
		end

	generator: STRING_8
			-- Name of current object's generating class
			-- (base class of the type of which it is a direct instance)
			-- (from ANY)
		external
			"built_in"
		ensure -- from ANY
			generator_not_void: Result /= Void
			generator_not_empty: not Result.is_empty
		end

	Integer_: KL_INTEGER_ROUTINES
			-- Routines that ought to be in class INTEGER
			-- (from KL_IMPORTED_INTEGER_ROUTINES)
		once
			create Result
		ensure -- from KL_IMPORTED_INTEGER_ROUTINES
			instance_free: class
			integer_routines_not_void: Result /= Void
		end

	String_: KL_STRING_ROUTINES
			-- Routines that ought to be in class STRING
			-- (from KL_IMPORTED_STRING_ROUTINES)
		once
			create Result
		ensure -- from KL_IMPORTED_STRING_ROUTINES
			instance_free: class
			string_routines_not_void: Result /= Void
		end

	Unicode: UC_UNICODE_ROUTINES
			-- Unicode routines
			-- (from UC_IMPORTED_UNICODE_ROUTINES)
		once
			create Result
		ensure -- from UC_IMPORTED_UNICODE_ROUTINES
			instance_free: class
			unicode_not_void: Result /= Void
		end

feature -- Measurement

	character_byte_count (c: CHARACTER_8): INTEGER_32
			-- Number of bytes needed to encode character
			-- `c' with the UTF-8 encoding
		local
			a_code: INTEGER_32
		do
			if c <= Byte_127 then
				Result := 1
			elseif c <= Byte_255 then
				Result := 2
			else
					-- Only reachable if `c' can be greater than `Byte_255'
					-- (code 255); kept as a fallback driven by the character code.
				a_code := c.code
				if a_code < 2048 then
					Result := 2
				elseif a_code < 65536 then
					Result := 3
				else
					Result := 4
				end
			end
		ensure
			instance_free: class
			character_byte_count_large_enough: Result >= 1
			character_byte_count_small_enough: Result <= 4
		end

	code_byte_count (a_code: INTEGER_32): INTEGER_32
			-- Number of bytes needed to encode unicode character
			-- of code `a_code' with the UTF-8 encoding
		require
			valid_code: Unicode.valid_non_surrogate_code (a_code)
		do
			if a_code < 128 then
				Result := 1
			elseif a_code < 2048 then
				Result := 2
			elseif a_code < 65536 then
				Result := 3
			else
				Result := 4
			end
		ensure
			instance_free: class
			code_byte_count_large_enough: Result >= 1
			code_byte_count_small_enough: Result <= 4
		end

	encoded_byte_count (a_byte: CHARACTER_8): INTEGER_32
			-- Number of bytes which were necessary to encode
			-- the unicode character whose first byte is `a_byte'
		require
			is_encoded_first_byte: is_encoded_first_byte (a_byte)
		do
			if a_byte <= Byte_127 then
				Result := 1
			elseif a_byte <= Byte_223 then
				Result := 2
			elseif a_byte <= Byte_239 then
				Result := 3
			else
				Result := 4
			end
		ensure
			instance_free: class
			encoded_byte_code_large_enough: Result >= 1
			encoded_byte_code_small_enough: Result <= 4
		end

	substring_byte_count (a_string: READABLE_STRING_GENERAL; start_index, end_index: INTEGER_32): INTEGER_32
			-- Number of bytes needed to encode characters of
			-- `a_string' between `start_index' and `end_index'
			-- inclusive with the UTF-8 encoding
			-- (0 when the interval is empty, i.e. `start_index' = `end_index' + 1)
		require
			a_string_not_void: a_string /= Void
			valid_start_index: 1 <= start_index
			valid_end_index: end_index <= a_string.count
			meaningful_interval: start_index <= end_index + 1
		local
			s, e: INTEGER_32
			i: INTEGER_32
			even_end_index: INTEGER_32
			c: CHARACTER_8
		do
			if start_index <= end_index then
				if Any_.same_types (a_string, Dummy_string) and then attached {STRING_8} a_string as l_string_8 then
						-- Plain STRING_8: characters are handled two per iteration.
						-- `even_end_index' is the last index such that the range
						-- `start_index' .. `even_end_index' holds an even number of
						-- characters. It has to be derived from the length of the
						-- interval, not from the parity of `end_index' alone:
						-- otherwise an even `start_index' makes the loop either
						-- read `item (end_index + 1)' or count `item (end_index)' twice.
					even_end_index := end_index - ((end_index - start_index + 1) \\ 2)
					from
						i := start_index
					until
						i > even_end_index
					loop
						c := l_string_8.item (i)
						if c <= Byte_127 then
							Result := Result + 1
						else
							Result := Result + character_byte_count (c)
						end
						c := l_string_8.item (i + 1)
						if c <= Byte_127 then
							Result := Result + 1
						else
							Result := Result + character_byte_count (c)
						end
						i := i + 2
					end
					if even_end_index < end_index then
							-- Odd number of characters: one character is left over.
						Result := Result + character_byte_count (l_string_8.item (end_index))
					end
				elseif Any_.same_types (a_string, Dummy_uc_string) and then attached {UC_STRING} a_string as a_uc_string then
						-- UC_STRING: rely on the byte positions reported by the string itself.
					if start_index = 1 and end_index = a_uc_string.count then
						Result := a_uc_string.byte_count
					else
						s := a_uc_string.byte_index (start_index)
						if end_index = a_uc_string.count then
							Result := a_uc_string.byte_count - s + 1
						else
							e := a_uc_string.shifted_byte_index (s, end_index - start_index + 1)
							Result := e - s
						end
					end
				elseif attached {UC_UTF8_STRING} a_string as a_utf8 then
						-- UC_UTF8_STRING: same computation as for UC_STRING above.
					if start_index = 1 and end_index = a_utf8.count then
						Result := a_utf8.byte_count
					else
						s := a_utf8.byte_index (start_index)
						if end_index = a_utf8.count then
							Result := a_utf8.byte_count - s + 1
						else
							e := a_utf8.shifted_byte_index (s, end_index - start_index + 1)
							Result := e - s
						end
					end
				else
						-- Any other kind of string: sum the byte count of each code.
					from
						i := start_index
					until
						i > end_index
					loop
						Result := Result + code_byte_count (a_string.code (i).to_integer_32)
						i := i + 1
					end
				end
			end
		ensure
			instance_free: class
			substring_byte_count_positive: Result >= 0
		end

feature -- Comparison

	frozen deep_equal (a: detachable ANY; b: like arg #1): BOOLEAN
			-- Are `a' and `b' either both void
			-- or attached to isomorphic object structures?
			-- (from ANY)
		do
			if a = Void then
				Result := b = Void
			else
				Result := b /= Void and then a.is_deep_equal (b)
			end
		ensure -- from ANY
			instance_free: class
			shallow_implies_deep: standard_equal (a, b) implies Result
			both_or_none_void: (a = Void) implies (Result = (b = Void))
			same_type: (Result and (a /= Void)) implies (b /= Void and then a.same_type (b))
			symmetric: Result implies deep_equal (b, a)
		end

	frozen equal (a: detachable ANY; b: like arg #1): BOOLEAN
			-- Are `a' and `b' either both void or attached
			-- to objects considered equal?
			-- (from ANY)
		do
			if a = Void then
				Result := b = Void
			else
				Result := b /= Void and then a.is_equal (b)
			end
		ensure -- from ANY
			instance_free: class
			definition: Result = (a = Void and b = Void) or else ((a /= Void and b /= Void) and then a.is_equal (b))
		end

	frozen is_deep_equal (other: UC_UTF8_ROUTINES): BOOLEAN
			-- Are `Current' and `other' attached to isomorphic object structures?
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
		external
			"built_in"
		ensure -- from ANY
			shallow_implies_deep: standard_is_equal (other) implies Result
			same_type: Result implies same_type (other)
			symmetric: Result implies other.is_deep_equal (Current)
		end

	is_equal (other: UC_UTF8_ROUTINES): BOOLEAN
			-- Is `other' attached to an object considered
			-- equal to current object?
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
		external
			"built_in"
		ensure -- from ANY
			symmetric: Result implies other ~ Current
			consistent: standard_is_equal (other) implies Result
		end

	frozen standard_equal (a: detachable ANY; b: like arg #1): BOOLEAN
			-- Are `a' and `b' either both void or attached to
			-- field-by-field identical objects of the same type?
			-- Always uses default object comparison criterion.
			-- (from ANY)
		do
			if a = Void then
				Result := b = Void
			else
				Result := b /= Void and then a.standard_is_equal (b)
			end
		ensure -- from ANY
			instance_free: class
			definition: Result = (a = Void and b = Void) or else ((a /= Void and b /= Void) and then a.standard_is_equal (b))
		end

	frozen standard_is_equal (other: UC_UTF8_ROUTINES): BOOLEAN
			-- Is `other' attached to an object of the same type
			-- as current object, and field-by-field identical to it?
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
		external
			"built_in"
		ensure -- from ANY
			same_type: Result implies same_type (other)
			symmetric: Result implies other.standard_is_equal (Current)
		end

feature -- Status report

	conforms_to (other: ANY): BOOLEAN
			-- Does type of current object conform to type
			-- of `other' (as per Eiffel: The Language, chapter 13)?
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
		external
			"built_in"
		end

	is_encoded_first_byte (a_byte: CHARACTER_8): BOOLEAN
			-- Is `a_byte' the first byte in UTF-8 encoding?
			-- (True for codes 0..127 and 194..244.)
		do
			Result := (a_byte <= Byte_127 or (Byte_194 <= a_byte and a_byte <= Byte_244))
		ensure
			instance_free: class
		end

	is_encoded_next_byte (a_byte: CHARACTER_8): BOOLEAN
			-- Is `a_byte' one of the next bytes in UTF-8 encoding?
			-- (True for codes 128..191.)
		do
			Result := (Byte_127 < a_byte and a_byte <= Byte_191)
		ensure
			instance_free: class
		end

	is_encoded_second_byte (a_byte, a_first_byte: CHARACTER_8): BOOLEAN
			-- Is `a_byte' a valid second byte in UTF-8 encoding?
			-- The accepted range depends on `a_first_byte': it is narrowed
			-- for first bytes 224, 237, 240 and 244, and is 128..191 otherwise.
		require
			valid_first_byte: is_encoded_first_byte (a_first_byte)
		do
			if a_first_byte = Byte_224 then
					-- 160..191
				Result := (Byte_159 < a_byte and a_byte <= Byte_191)
			elseif a_first_byte = Byte_237 then
					-- 128..159
				Result := (Byte_127 < a_byte and a_byte <= Byte_159)
			elseif a_first_byte = Byte_240 then
					-- 144..191
				Result := (Byte_143 < a_byte and a_byte <= Byte_191)
			elseif a_first_byte = Byte_244 then
					-- 128..143
				Result := (Byte_127 < a_byte and a_byte <= Byte_143)
			else
					-- 128..191
				Result := (Byte_127 < a_byte and a_byte <= Byte_191)
			end
		ensure
			instance_free: class
		end

	is_endian_detection_character (a_first, a_second, a_third: CHARACTER_8): BOOLEAN
			-- Is this sequence a UTF-8 Byte Order Marker (BOM)?
		do
			Result := is_endian_detection_character_start (a_first, a_second) and a_third = Byte_bf
		ensure
			instance_free: class
			result_start: Result implies is_endian_detection_character_start (a_first, a_second)
		end

	is_endian_detection_character_start (a_first, a_second: CHARACTER_8): BOOLEAN
			-- Are these characters the start of a UTF-8 encoded Byte Order Marker (BOM)?
		do
			Result := a_first = Byte_ef and a_second = Byte_bb
		ensure
			instance_free: class
		end

	same_type (other: ANY): BOOLEAN
			-- Is type of current object identical to type of `other'?
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
		external
			"built_in"
		ensure -- from ANY
			definition: Result = (conforms_to (other) and other.conforms_to (Current))
		end

	valid_utf8 (a_string: STRING_8): BOOLEAN
			-- Are the bytes in `a_string' a valid UTF-8 encoding?
			-- (An empty string is valid.)
		require
			a_string_not_void: a_string /= Void
			a_string_is_string: Any_.same_types (a_string, "")
		local
			i, nb, nb2: INTEGER_32
			bc, a_code: INTEGER_32
			a_byte, a_first_byte: CHARACTER_8
		do
			Result := True
			nb := a_string.count
			from
				i := 1
			until
				i > nb
			loop
				a_first_byte := a_string.item (i)
				if is_encoded_first_byte (a_first_byte) then
					bc := encoded_byte_count (a_first_byte)
					if bc = 1 then
						i := i + 1
					else
							-- `nb2' is the index of the last byte of this sequence.
						nb2 := i + bc - 1
						if nb2 > nb then
								-- Truncated sequence: stop scanning.
							Result := False
							i := nb + 1
						else
							a_code := encoded_first_value (a_first_byte)
							i := i + 1
							a_byte := a_string.item (i)
							if not is_encoded_second_byte (a_byte, a_first_byte) then
								Result := False
								i := nb + 1
							else
								a_code := a_code * 64 + encoded_next_value (a_byte)
									-- Reject values too small for a sequence of `bc' bytes,
									-- judged from the first two bytes only.
								inspect bc
								when 2 then
									if a_code <= Code_127 then
										Result := False
									end
								when 3 then
									if a_code <= Code_31 then
										Result := False
									end
								when 4 then
									if a_code <= Code_15 then
										Result := False
									end
								end
								if Result then
										-- Check the remaining bytes of the sequence, if any.
									from
										i := i + 1
									until
										i > nb2
									loop
										if is_encoded_next_byte (a_string.item (i)) then
											i := i + 1
										else
											Result := False
											i := nb + 1
										end
									end
								end
							end
						end
					end
				else
					Result := False
					i := nb + 1
				end
			end
		ensure
			instance_free: class
		end

feature -- Element change

	append_code_to_utf8 (a_utf8: STRING_8; a_code: INTEGER_32)
			-- Add UTF-8 encoded character of code `a_code'
			-- at the end of `a_utf8'.
		require
			a_utf8_not_void: a_utf8 /= Void
			a_utf8_is_string: Any_.same_types (a_utf8, "")
			a_utf8_valid: valid_utf8 (a_utf8)
			valid_code: Unicode.valid_non_surrogate_code (a_code)
		local
			b2, b3, b4: CHARACTER_8
			c: INTEGER_32
		do
				-- Trailing bytes are computed first, from the low-order 6-bit
				-- groups of `a_code' (each offset by 128); the lead byte is built
				-- from what remains and is appended before them.
			inspect code_byte_count (a_code)
			when 1 then
				a_utf8.append_character (Integer_.to_character (a_code))
			when 2 then
				c := a_code
				b2 := Integer_.to_character ((c \\ 64) + 128)
				c := c // 64
				a_utf8.append_character (Integer_.to_character (c + 192))
				a_utf8.append_character (b2)
			when 3 then
				c := a_code
				b3 := Integer_.to_character ((c \\ 64) + 128)
				c := c // 64
				b2 := Integer_.to_character ((c \\ 64) + 128)
				c := c // 64
				a_utf8.append_character (Integer_.to_character (c + 224))
				a_utf8.append_character (b2)
				a_utf8.append_character (b3)
			when 4 then
				c := a_code
				b4 := Integer_.to_character ((c \\ 64) + 128)
				c := c // 64
				b3 := Integer_.to_character ((c \\ 64) + 128)
				c := c // 64
				b2 := Integer_.to_character ((c \\ 64) + 128)
				c := c // 64
				a_utf8.append_character (Integer_.to_character (c + 240))
				a_utf8.append_character (b2)
				a_utf8.append_character (b3)
				a_utf8.append_character (b4)
			end
		ensure
			instance_free: class
			a_utf8_valid: valid_utf8 (a_utf8)
		end

feature -- Conversion

	to_utf8 (a_string: STRING_8): STRING_8
			-- New STRING made up of bytes corresponding to
			-- the UTF-8 representation of `a_string'
		require
			a_string_not_void: a_string /= Void
		local
			i, nb: INTEGER_32
		do
			if attached {UC_STRING} a_string as uc_string then
				Result := uc_string.to_utf8
			else
				nb := a_string.count
					-- `nb' is only the initial capacity; the result grows as needed.
				create Result.make (nb)
				from
					i := 1
				until
					i > nb
				loop
					append_code_to_utf8 (Result, a_string.item_code (i))
					i := i + 1
				end
			end
		ensure
			instance_free: class
			to_utf8_not_void: Result /= Void
			string_type: Any_.same_types (Result, "")
			valid_utf8: valid_utf8 (Result)
		end

feature -- Duplication

	frozen clone (other: detachable ANY): like other
		obsolete
			"Use `twin' instead. [2017-05-31]"
			-- Void if `other' is void; otherwise new object
			-- equal to `other'
			--
			-- For non-void `other', `clone' calls `copy';
			-- to change copying/cloning semantics, redefine `copy'.
			-- (from ANY)
		do
			if other /= Void then
				Result := other.twin
			end
		ensure -- from ANY
			instance_free: class
			equal: Result ~ other
		end

	copy (other: UC_UTF8_ROUTINES)
			-- Update current object using fields of object attached
			-- to `other', so as to yield equal objects.
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
			type_identity: same_type (other)
		external
			"built_in"
		ensure -- from ANY
			is_equal: Current ~ other
		end

	frozen deep_clone (other: detachable ANY): like other
		obsolete
			"Use `deep_twin' instead. [2017-05-31]"
			-- Void if `other' is void: otherwise, new object structure
			-- recursively duplicated from the one attached to `other'
			-- (from ANY)
		do
			if other /= Void then
				Result := other.deep_twin
			end
		ensure -- from ANY
			instance_free: class
			deep_equal: deep_equal (other, Result)
		end

	frozen deep_copy (other: UC_UTF8_ROUTINES)
			-- Effect equivalent to that of:
			--		copy (other . deep_twin)
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
		do
			copy (other.deep_twin)
		ensure -- from ANY
			deep_equal: deep_equal (Current, other)
		end

	frozen deep_twin: UC_UTF8_ROUTINES
			-- New object structure recursively duplicated from Current.
			-- (from ANY)
		external
			"built_in"
		ensure -- from ANY
			deep_twin_not_void: Result /= Void
			deep_equal: deep_equal (Current, Result)
		end

	frozen standard_clone (other: detachable ANY): like other
		obsolete
			"Use `standard_twin' instead. [2017-05-31]"
			-- Void if `other' is void; otherwise new object
			-- field-by-field identical to `other'.
			-- Always uses default copying semantics.
			-- (from ANY)
		do
			if other /= Void then
				Result := other.standard_twin
			end
		ensure -- from ANY
			instance_free: class
			equal: standard_equal (Result, other)
		end

	frozen standard_copy (other: UC_UTF8_ROUTINES)
			-- Copy every field of `other' onto corresponding field
			-- of current object.
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
			type_identity: same_type (other)
		external
			"built_in"
		ensure -- from ANY
			is_standard_equal: standard_is_equal (other)
		end

	frozen standard_twin: UC_UTF8_ROUTINES
			-- New object field-by-field identical to `other'.
			-- Always uses default copying semantics.
			-- (from ANY)
		external
			"built_in"
		ensure -- from ANY
			standard_twin_not_void: Result /= Void
			equal: standard_equal (Result, Current)
		end

	frozen twin: UC_UTF8_ROUTINES
			-- New object equal to `Current'
			-- `twin' calls `copy'; to change copying/twinning semantics, redefine `copy'.
			-- (from ANY)
		external
			"built_in"
		ensure -- from ANY
			twin_not_void: Result /= Void
			is_equal: Result ~ Current
		end

feature -- Basic operations

	frozen as_attached: attached UC_UTF8_ROUTINES
		obsolete
			"Remove calls to this feature. [2017-05-31]"
			-- Attached version of Current.
			-- (Can be used during transitional period to convert
			-- non-void-safe classes to void-safe ones.)
			-- (from ANY)
		do
			Result := Current
		end

	frozen default: detachable UC_UTF8_ROUTINES
			-- Default value of object's type
			-- (from ANY)
		do
		end

	frozen default_pointer: POINTER
			-- Default value of type POINTER
			-- (Avoid the need to write `p'.`default' for
			-- some `p' of type POINTER.)
			-- (from ANY)
		do
		ensure -- from ANY
			instance_free: class
		end

	default_rescue
			-- Process exception for routines with no Rescue clause.
			-- (Default: do nothing.)
			-- (from ANY)
		do
		end

	frozen do_nothing
			-- Execute a null action.
			-- (from ANY)
		do
		ensure -- from ANY
			instance_free: class
		end

feature {NONE} -- Implementation

	Dummy_string: STRING_8 = ""
			-- Dummy string
			-- (used as the type witness for plain STRING_8 in `substring_byte_count')

	Dummy_uc_string: UC_STRING
			-- Dummy UC_STRING
			-- (used as the type witness for UC_STRING in `substring_byte_count')
		once
			create Result.make_empty
		ensure
			instance_free: class
			dummy_uc_string_not_void: Result /= Void
		end

feature {NONE} -- Constants

		-- Byte constants are written as numeric character codes ('%/NNN/')
		-- so that their values do not depend on the encoding of this file.

	Byte_127: CHARACTER_8 = '%/127/'
			-- Highest ASCII character/1st UTF-8 byte

	Byte_143: CHARACTER_8 = '%/143/'
			-- 0x8F

	Byte_159: CHARACTER_8 = '%/159/'
			-- 0x9F

	Byte_191: CHARACTER_8 = '%/191/'
			-- 10111111

	Byte_194: CHARACTER_8 = '%/194/'
			-- 0xC2

	Byte_223: CHARACTER_8 = '%/223/'
			-- 11011111

	Byte_224: CHARACTER_8 = '%/224/'
			-- 0xE0

	Byte_237: CHARACTER_8 = '%/237/'
			-- 0xED

	Byte_239: CHARACTER_8 = '%/239/'
			-- 11101111

	Byte_240: CHARACTER_8 = '%/240/'
			-- 0xF0

	Byte_244: CHARACTER_8 = '%/244/'
			-- 0xF4

	Byte_247: CHARACTER_8 = '%/247/'
			-- 11110111

	Byte_251: CHARACTER_8 = '%/251/'
			-- 11111011

	Byte_253: CHARACTER_8 = '%/253/'
			-- 11111101

	Byte_255: CHARACTER_8 = '%/255/'
			-- 0xFF

	Byte_bb: CHARACTER_8 = '%/187/'
			-- UTF-8 BOM second: BB

	Byte_bf: CHARACTER_8 = '%/191/'
			-- UTF-8 BOM third: BF

	Byte_ef: CHARACTER_8 = '%/239/'
			-- UTF-8 BOM first: EF

	Code_127: INTEGER_32 = 127
			-- 01111111

	Code_15: INTEGER_32 = 15

	Code_191: INTEGER_32 = 191
			-- 10111111

	Code_223: INTEGER_32 = 223
			-- 11011111

	Code_239: INTEGER_32 = 239
			-- 11101111

	Code_247: INTEGER_32 = 247
			-- 11110111

	Code_251: INTEGER_32 = 251
			-- 11111011

	Code_253: INTEGER_32 = 253
			-- 11111101

	Code_3: INTEGER_32 = 3
			-- 111110xx (2^2 - 1 = 3)

	Code_31: INTEGER_32 = 31

	Code_7: INTEGER_32 = 7
			-- 11110xxx (2^3 - 1 = 7)

feature -- Output

	Io: STD_FILES
			-- Handle to standard file setup
			-- (from ANY)
		once
			create Result
			Result.set_output_default
		ensure -- from ANY
			instance_free: class
			io_not_void: Result /= Void
		end

	out: STRING_8
			-- New string containing terse printable representation
			-- of current object
			-- (from ANY)
		do
			Result := tagged_out
		ensure -- from ANY
			out_not_void: Result /= Void
		end

	print (o: detachable ANY)
			-- Write terse external representation of `o'
			-- on standard output.
			-- (from ANY)
		do
			if o /= Void then
				Io.put_string (o.out)
			end
		ensure -- from ANY
			instance_free: class
		end

	frozen tagged_out: STRING_8
			-- New string containing terse printable representation
			-- of current object
			-- (from ANY)
		external
			"built_in"
		ensure -- from ANY
			tagged_out_not_void: Result /= Void
		end

feature -- Platform

	Operating_environment: OPERATING_ENVIRONMENT
			-- Objects available from the operating system
			-- (from ANY)
		once
			create Result
		ensure -- from ANY
			instance_free: class
			operating_environment_not_void: Result /= Void
		end

feature {NONE} -- Retrieval

	frozen internal_correct_mismatch
			-- Called from runtime to perform a proper dynamic dispatch on `correct_mismatch'
			-- from MISMATCH_CORRECTOR.
			-- (from ANY)
		local
			l_msg: STRING_8
			l_exc: EXCEPTIONS
		do
			if attached {MISMATCH_CORRECTOR} Current as l_corrector then
				l_corrector.correct_mismatch
			else
					-- No corrector available: raise a retrieval exception
					-- naming the generating type.
				create l_msg.make_from_string ("Mismatch: ")
				create l_exc
				l_msg.append (generating_type.name)
				l_exc.raise_retrieval_exception (l_msg)
			end
		end

invariant
		-- from ANY
	reflexive_equality: standard_is_equal (Current)
	reflexive_conformance: conforms_to (Current)

end -- class UC_UTF8_ROUTINES
-- Generated by ISE EiffelStudio