note
	description: "UTF-16 encoding routines"
	library: "Gobo Eiffel Kernel Library"
	copyright: "Copyright (c) 2002-2018, Eric Bezault and others"
	license: "MIT License"
	date: "$Date: 2019-02-07 22:54:15 +0000 (Thu, 07 Feb 2019) $"
	revision: "$Revision: 102807 $"

class interface
	UC_UTF16_ROUTINES

create
	default_create
			-- Process instances of classes with no creation clause.
			-- (Default: do nothing.)
			-- (from ANY)

feature -- Access

	Any_: KL_ANY_ROUTINES
			-- Routines that ought to be in class ANY
			-- (from KL_IMPORTED_ANY_ROUTINES)
		ensure -- from KL_IMPORTED_ANY_ROUTINES
			instance_free: class
			any_routines_not_void: Result /= Void

	Canonical_decomposition_mapping: INTEGER_32 = 0
			-- Decomposition mapping is canonical
			-- (from UC_UNICODE_CONSTANTS)

	Close_punctuation_category: INTEGER_32 = 15
			-- Close punctuation
			-- (from UC_UNICODE_CONSTANTS)

	Compatibility_decomposition_mapping: INTEGER_32 = 16
			-- Decomposition mapping for unspecified compatibility character
			-- (from UC_UNICODE_CONSTANTS)

	Connector_punctuation_category: INTEGER_32 = 12
			-- Connector punctuation
			-- (from UC_UNICODE_CONSTANTS)

	Control_other_category: INTEGER_32 = 26
			-- Control character
			-- (from UC_UNICODE_CONSTANTS)

	Currency_symbol_category: INTEGER_32 = 20
			-- Currency symbol
			-- (from UC_UNICODE_CONSTANTS)

	Dash_punctuation_category: INTEGER_32 = 13
			-- Dash punctuation
			-- (from UC_UNICODE_CONSTANTS)

	Decimal_digit_number_category: INTEGER_32 = 9
			-- Decimal digit number
			-- (from UC_UNICODE_CONSTANTS)

	Encircled_decomposition_mapping: INTEGER_32 = 7
			-- Decomposition mapping for encircled form
			-- (from UC_UNICODE_CONSTANTS)

	Enclosing_mark_category: INTEGER_32 = 8
			-- Enclosing mark
			-- (from UC_UNICODE_CONSTANTS)

	Final_decomposition_mapping: INTEGER_32 = 5
			-- Decomposition mapping for Arabic final presentation form
			-- (from UC_UNICODE_CONSTANTS)

	Final_quote_punctuation_category: INTEGER_32 = 17
			-- Final quote punctuation
			-- (from UC_UNICODE_CONSTANTS)

	Font_decomposition_mapping: INTEGER_32 = 1
			-- Decomposition mapping for font variant
			-- (from UC_UNICODE_CONSTANTS)

	Format_other_category: INTEGER_32 = 27
			-- Format character
			-- (from UC_UNICODE_CONSTANTS)

	Fraction_decomposition_mapping: INTEGER_32 = 15
			-- Decomposition mapping for vulgar fraction form
			-- (from UC_UNICODE_CONSTANTS)

	generating_type: TYPE [detachable UC_UTF16_ROUTINES]
			-- Type of current object
			-- (type of which it is a direct instance)
			-- (from ANY)
		ensure -- from ANY
			generating_type_not_void: Result /= Void

	generator: STRING_8
			-- Name of current object's generating class
			-- (base class of the type of which it is a direct instance)
			-- (from ANY)
		ensure -- from ANY
			generator_not_void: Result /= Void
			generator_not_empty: not Result.is_empty

	Initial_decomposition_mapping: INTEGER_32 = 3
			-- Decomposition mapping for Arabic initial presentation form
			-- (from UC_UNICODE_CONSTANTS)

	Initial_quote_punctuation_category: INTEGER_32 = 16
			-- Initial quote punctuation
			-- (from UC_UNICODE_CONSTANTS)

	Integer_: KL_INTEGER_ROUTINES
			-- Routines that ought to be in class INTEGER
			-- (from KL_IMPORTED_INTEGER_ROUTINES)
		ensure -- from KL_IMPORTED_INTEGER_ROUTINES
			instance_free: class
			integer_routines_not_void: Result /= Void

	Isolated_decomposition_mapping: INTEGER_32 = 6
			-- Decomposition mapping for Arabic isolated presentation form
			-- (from UC_UNICODE_CONSTANTS)

	Letter_number_category: INTEGER_32 = 10
			-- Letter number
			-- (from UC_UNICODE_CONSTANTS)

	Line_separator_category: INTEGER_32 = 24
			-- Line separator
			-- (from UC_UNICODE_CONSTANTS)

	Lowercase_letter_category: INTEGER_32 = 2
			-- Lower case letter
			-- (from UC_UNICODE_CONSTANTS)

	Math_symbol_category: INTEGER_32 = 19
			-- Mathematics symbol
			-- (from UC_UNICODE_CONSTANTS)

	Maximum_ascii_character: CHARACTER_8 = '%/127/'
			-- Largest ASCII character
			-- ensure
			--		definition: Result.code = Maximum_ascii_character_code
			-- end
			-- (from UC_UNICODE_CONSTANTS)

	Maximum_ascii_character_code: INTEGER_32 = 127
			-- Largest code for ASCII characters
			-- (2^7 - 1)
			-- ensure
			--		definition: Result = 127
			--		small_enough: Result <= Platform.Maximum_byte_code
			-- end
			-- (from UC_UNICODE_CONSTANTS)

	Maximum_bmp_character_code: INTEGER_32 = 65535
			-- Largest code for unicode characters in Basic Multi-lingual Plane (FFFF);
			-- ensure
			--		definition: Result = 65535
			-- end
			-- (from UC_UNICODE_CONSTANTS)

	Maximum_unicode_character_code: INTEGER_32 = 1114111
			-- Largest code for unicode characters (10FFFF);
			-- Includes final two non-characters.
			-- ensure
			--		definition: Result = 1114111
			-- end
			-- (from UC_UNICODE_CONSTANTS)

	Maximum_unicode_surrogate_code: INTEGER_32 = 57343
			-- Highest unicode surrogate code-point (0xDFFF)
			-- ensure
			--		definition: Result = 57343
			-- end
			-- (from UC_UNICODE_CONSTANTS)

	Medial_decomposition_mapping: INTEGER_32 = 4
			-- Decomposition mapping for Arabic medial presentation form
			-- (from UC_UNICODE_CONSTANTS)

	Minimum_ascii_character: CHARACTER_8 = '%U'
			-- Smallest ASCII character
			-- ensure
			--		definition: Result.code = Minimum_ascii_character_code
			-- end
			-- (from UC_UNICODE_CONSTANTS)

	Minimum_ascii_character_code: INTEGER_32 = 0
			-- Smallest code for ASCII characters
			-- ensure
			--		definition: Result = 0
			-- end
			-- (from UC_UNICODE_CONSTANTS)

	Minimum_unicode_character_code: INTEGER_32 = 0
			-- Smallest code for unicode characters
			-- ensure
			--		definition: Result = 0
			-- end
			-- (from UC_UNICODE_CONSTANTS)

	Minimum_unicode_surrogate_code: INTEGER_32 = 55296
			-- Lowest unicode surrogate code-point (0xD800)
			-- ensure
			--		definition: Result = 55296
			-- end
			-- (from UC_UNICODE_CONSTANTS)

	Modifier_letter_category: INTEGER_32 = 4
			-- Modifier letter
			-- (from UC_UNICODE_CONSTANTS)

	Modifier_symbol_category: INTEGER_32 = 21
			-- Modifier symbol
			-- (from UC_UNICODE_CONSTANTS)

	Narrow_decomposition_mapping: INTEGER_32 = 12
			-- Decomposition mapping for narrow (hankaku) compatibility character
			-- (from UC_UNICODE_CONSTANTS)

	No_break_decomposition_mapping: INTEGER_32 = 2
			-- Decomposition mapping for no-break variant
			-- (from UC_UNICODE_CONSTANTS)

	Non_spacing_mark_category: INTEGER_32 = 6
			-- Non-spacing mark
			-- (from UC_UNICODE_CONSTANTS)

	Open_punctuation_category: INTEGER_32 = 14
			-- Open punctuation
			-- (from UC_UNICODE_CONSTANTS)

	Other_letter_category: INTEGER_32 = 5
			-- Other letter
			-- (from UC_UNICODE_CONSTANTS)

	Other_number_category: INTEGER_32 = 11
			-- Other number
			-- (from UC_UNICODE_CONSTANTS)

	Other_punctuation_category: INTEGER_32 = 18
			-- Other punctuation
			-- (from UC_UNICODE_CONSTANTS)

	Other_symbol_category: INTEGER_32 = 22
			-- Other symbol
			-- (from UC_UNICODE_CONSTANTS)

	Paragraph_separator_category: INTEGER_32 = 25
			-- Paragraph separator
			-- (from UC_UNICODE_CONSTANTS)

	Platform: KL_PLATFORM
			-- Platform-dependent properties
			-- (from KL_SHARED_PLATFORM)
		ensure -- from KL_SHARED_PLATFORM
			instance_free: class
			platform_not_void: Result /= Void

	Private_other_category: INTEGER_32 = 29
			-- Private-use character
			-- (from UC_UNICODE_CONSTANTS)

	Small_decomposition_mapping: INTEGER_32 = 13
			-- Decomposition mapping for small variant form (CNS compatibility)
			-- (from UC_UNICODE_CONSTANTS)

	Space_separator_category: INTEGER_32 = 23
			-- Space separator
			-- (from UC_UNICODE_CONSTANTS)

	Spacing_combining_mark_category: INTEGER_32 = 7
			-- Spacing combining mark
			-- (from UC_UNICODE_CONSTANTS)

	Square_decomposition_mapping: INTEGER_32 = 14
			-- Decomposition mapping for CJK squared font variant
			-- (from UC_UNICODE_CONSTANTS)

	Subscript_decomposition_mapping: INTEGER_32 = 9
			-- Decomposition mapping for subscript form
			-- (from UC_UNICODE_CONSTANTS)

	Superscript_decomposition_mapping: INTEGER_32 = 8
			-- Decomposition mapping for superscript form
			-- (from UC_UNICODE_CONSTANTS)

	Surrogate_other_category: INTEGER_32 = 28
			-- Surrogate character
			-- (from UC_UNICODE_CONSTANTS)

	Titlecase_letter_category: INTEGER_32 = 3
			-- Title case letter
			-- (from UC_UNICODE_CONSTANTS)

	Unassigned_other_category: INTEGER_32 = 0
			-- Unassigned character
			-- (from UC_UNICODE_CONSTANTS)

	Uppercase_letter_category: INTEGER_32 = 1
			-- Upper case letter
			-- (from UC_UNICODE_CONSTANTS)

	Vertical_decomposition_mapping: INTEGER_32 = 10
			-- Decomposition mapping for vertical layout presentation form
			-- (from UC_UNICODE_CONSTANTS)

	Wide_decomposition_mapping: INTEGER_32 = 11
			-- Decomposition mapping for wide (zenkaku) compatibility character
			-- (from UC_UNICODE_CONSTANTS)

feature -- Comparison

	frozen deep_equal (a: detachable ANY; b: like arg #1): BOOLEAN
			-- Are a and b either both void
			-- or attached to isomorphic object structures?
			-- (from ANY)
		ensure -- from ANY
			instance_free: class
			shallow_implies_deep: standard_equal (a, b) implies Result
			both_or_none_void: (a = Void) implies (Result = (b = Void))
			same_type: (Result and (a /= Void)) implies (b /= Void and then a.same_type (b))
			symmetric: Result implies deep_equal (b, a)

	frozen equal (a: detachable ANY; b: like arg #1): BOOLEAN
			-- Are a and b either both void or attached
			-- to objects considered equal?
			-- (from ANY)
		ensure -- from ANY
			instance_free: class
			definition: Result = (a = Void and b = Void) or else ((a /= Void and b /= Void) and then a.is_equal (b))

	frozen is_deep_equal (other: UC_UTF16_ROUTINES): BOOLEAN
			-- Are Current and other attached to isomorphic object structures?
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
		ensure -- from ANY
			shallow_implies_deep: standard_is_equal (other) implies Result
			same_type: Result implies same_type (other)
			symmetric: Result implies other.is_deep_equal (Current)

	is_equal (other: UC_UTF16_ROUTINES): BOOLEAN
			-- Is other attached to an object considered
			-- equal to current object?
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
		ensure -- from ANY
			symmetric: Result implies other ~ Current
			consistent: standard_is_equal (other) implies Result

	frozen standard_equal (a: detachable ANY; b: like arg #1): BOOLEAN
			-- Are a and b either both void or attached to
			-- field-by-field identical objects of the same type?
			-- Always uses default object comparison criterion.
			-- (from ANY)
		ensure -- from ANY
			instance_free: class
			definition: Result = (a = Void and b = Void) or else ((a /= Void and b /= Void) and then a.standard_is_equal (b))

	frozen standard_is_equal (other: UC_UTF16_ROUTINES): BOOLEAN
			-- Is other attached to an object of the same type
			-- as current object, and field-by-field identical to it?
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
		ensure -- from ANY
			same_type: Result implies same_type (other)
			symmetric: Result implies other.standard_is_equal (Current)

feature -- Status report

	conforms_to (other: ANY): BOOLEAN
			-- Does type of current object conform to type
			-- of other (as per Eiffel: The Language, chapter 13)?
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void

	same_type (other: ANY): BOOLEAN
			-- Is type of current object identical to type of other?
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
		ensure -- from ANY
			definition: Result = (conforms_to (other) and other.conforms_to (Current))

	valid_utf16 (a_string: STRING_8): BOOLEAN
			-- Are the bytes in a_string a valid UTF-16 encoding?
			-- 'a_string' has one byte per character.
			-- Default to big endian when no BOM.
		require
			a_string_not_void: a_string /= Void
			a_string_is_string: Any_.same_types (a_string, "")
		ensure
			instance_free: class
			empty_is_true: a_string.count = 0 implies Result
			utf16_even_count: Result implies ((a_string.count \\ 2) = 0)

	valid_utf16be (a_string: STRING_8): BOOLEAN
			-- Are the bytes in a_string valid UTF-16BE?
			-- 'a_string' has one byte per character.
		require
			a_string_not_void: a_string /= Void
			a_string_is_string: Any_.same_types (a_string, "")
		ensure
			instance_free: class
			empty_is_true: a_string.count = 0 implies Result
			utf16_even_count: Result implies ((a_string.count \\ 2) = 0)

	valid_utf16le (a_string: STRING_8): BOOLEAN
			-- Are the bytes in a_string valid UTF-16LE?
			-- 'a_string' has one byte per character.
		require
			a_string_not_void: a_string /= Void
			a_string_is_string: Any_.same_types (a_string, "")
		ensure
			instance_free: class
			empty_is_true: a_string.count = 0 implies Result
			utf16_even_count: Result implies ((a_string.count \\ 2) = 0)

feature -- Duplication

	copy (other: UC_UTF16_ROUTINES)
			-- Update current object using fields of object attached
			-- to other, so as to yield equal objects.
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
			type_identity: same_type (other)
		ensure -- from ANY
			is_equal: Current ~ other

	frozen deep_copy (other: UC_UTF16_ROUTINES)
			-- Effect equivalent to that of:
			--		copy (other . deep_twin)
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
		ensure -- from ANY
			deep_equal: deep_equal (Current, other)

	frozen deep_twin: UC_UTF16_ROUTINES
			-- New object structure recursively duplicated from Current.
			-- (from ANY)
		ensure -- from ANY
			deep_twin_not_void: Result /= Void
			deep_equal: deep_equal (Current, Result)

	frozen standard_copy (other: UC_UTF16_ROUTINES)
			-- Copy every field of other onto corresponding field
			-- of current object.
			-- (from ANY)
		require -- from ANY
			other_not_void: other /= Void
			type_identity: same_type (other)
		ensure -- from ANY
			is_standard_equal: standard_is_equal (other)

	frozen standard_twin: UC_UTF16_ROUTINES
			-- New object field-by-field identical to Current.
			-- Always uses default copying semantics.
			-- (from ANY)
		ensure -- from ANY
			standard_twin_not_void: Result /= Void
			equal: standard_equal (Result, Current)

	frozen twin: UC_UTF16_ROUTINES
			-- New object equal to Current
			-- twin calls copy; to change copying/twinning semantics, redefine copy.
			-- (from ANY)
		ensure -- from ANY
			twin_not_void: Result /= Void
			is_equal: Result ~ Current

feature -- Basic operations

	frozen default: detachable UC_UTF16_ROUTINES
			-- Default value of object's type
			-- (from ANY)

	frozen default_pointer: POINTER
			-- Default value of type POINTER
			-- (Avoid the need to write p.default for
			-- some p of type POINTER.)
			-- (from ANY)
		ensure -- from ANY
			instance_free: class

	default_rescue
			-- Process exception for routines with no Rescue clause.
			-- (Default: do nothing.)
			-- (from ANY)

	frozen do_nothing
			-- Execute a null action.
			-- (from ANY)
		ensure -- from ANY
			instance_free: class

feature -- Endian-ness detection

	Bom_be: STRING_8
			-- BOM in big-endian format
		ensure
			instance_free: class
			bom_be_not_void: Result /= Void
			two_bytes: Result.count = 2
			first_byte: Result.item_code (1) = Hex_fe
			second_byte: Result.item_code (2) = Hex_ff

	Bom_le: STRING_8
			-- BOM in little-endian format
		ensure
			instance_free: class
			bom_le_not_void: Result /= Void
			two_bytes: Result.count = 2
			first_byte: Result.item_code (1) = Hex_ff
			second_byte: Result.item_code (2) = Hex_fe

	is_endian_detection_character (a_byte, other_byte: INTEGER_32): BOOLEAN
			-- Can these two bytes represent ZERO WIDTH NON-BREAKING SPACE?
			-- (It has to be unicode character 0xFEFF, because 0xFFFE is not a valid character.)
		require
			a_byte_is_byte: is_byte (a_byte)
			other_byte_is_byte: is_byte (other_byte)
		ensure
			instance_free: class
			definition: Result = (a_byte.min (other_byte) = Hex_fe and a_byte.max (other_byte) = Hex_ff)

	is_endian_detection_character_least_first (first, second: INTEGER_32): BOOLEAN
			-- Do the two bytes first and second represent the character
			-- 0xFEFF with first being the least significant byte?
		require
			a_byte_is_byte: is_byte (first)
			other_byte_is_byte: is_byte (second)
		ensure
			instance_free: class
			definition: Result = (is_endian_detection_character (first, second) and (first = Hex_ff))

	is_endian_detection_character_most_first (first, second: INTEGER_32): BOOLEAN
			-- Do the two bytes first and second represent the character
			-- 0xFEFF with first being the most significant byte?
		require
			a_byte_is_byte: is_byte (first)
			other_byte_is_byte: is_byte (second)
		ensure
			instance_free: class
			definition: Result = (is_endian_detection_character (first, second) and (first = Hex_fe))

feature -- Output

	Io: STD_FILES
			-- Handle to standard file setup
			-- (from ANY)
		ensure -- from ANY
			instance_free: class
			io_not_void: Result /= Void

	out: STRING_8
			-- New string containing terse printable representation
			-- of current object
			-- (from ANY)
		ensure -- from ANY
			out_not_void: Result /= Void

	print (o: detachable ANY)
			-- Write terse external representation of o
			-- on standard output.
			-- (from ANY)
		ensure -- from ANY
			instance_free: class

	frozen tagged_out: STRING_8
			-- New string containing terse printable representation
			-- of current object
			-- (from ANY)
		ensure -- from ANY
			tagged_out_not_void: Result /= Void

feature -- Platform

	Operating_environment: OPERATING_ENVIRONMENT
			-- Objects available from the operating system
			-- (from ANY)
		ensure -- from ANY
			instance_free: class
			operating_environment_not_void: Result /= Void

feature -- Surrogate

	is_byte (a: INTEGER_32): BOOLEAN
			-- Is a a byte?
		ensure
			instance_free: class
			definition: Result = (a >= 0 and a < Hex_100)

	is_high_surrogate (a_most: INTEGER_32): BOOLEAN
			-- Is this a high surrogate byte?
		require
			byte: is_byte (a_most)
		ensure
			instance_free: class

	is_low_surrogate (a_most: INTEGER_32): BOOLEAN
			-- Is this a low surrogate byte?
		require
			byte: is_byte (a_most)
		ensure
			instance_free: class

	is_surrogate (a_most: INTEGER_32): BOOLEAN
			-- Is this a surrogate byte?
			-- NOTE(review): the generated header repeated the text of
			-- is_high_surrogate; presumably this holds for either a high
			-- or a low surrogate byte -- confirm against the implementation.
		require
			byte: is_byte (a_most)
		ensure
			instance_free: class

	least_10_bits (msb, lsb: INTEGER_32): INTEGER_32
			-- UTF16 least 10 bits of a byte pair
		require
			msb_byte: is_byte (msb)
			lsb_byte: is_byte (lsb)
			surrogate: is_surrogate (msb)
		ensure
			instance_free: class
			ten_bits: Result >= 0 and Result < Hex_400

	supplementary_to_high_surrogate (a_code: INTEGER_32): INTEGER_32
			-- High surrogate for a_code
		require
			code_high_enough: a_code > Maximum_bmp_character_code
			code_low_enough: a_code <= Maximum_unicode_character_code
		ensure
			instance_free: class
			high_surrogate: Result >= 256 * Hex_d8
			not_too_big: Result < 256 * Hex_dc

	supplementary_to_low_surrogate (a_code: INTEGER_32): INTEGER_32
			-- Low surrogate for a_code
		require
			code_high_enough: a_code > Maximum_bmp_character_code
			code_low_enough: a_code <= Maximum_unicode_character_code
		ensure
			instance_free: class
			low_surrogate: Result >= 256 * Hex_dc
			not_too_big: Result < 256 * Hex_e0

	surrogate (a_high_10: INTEGER_32; a_low_10: INTEGER_32): INTEGER_32
			-- Supplementary code point from high and low values
		require
			high_10: a_high_10 >= 0 and a_high_10 < 1024
			low_10: a_low_10 >= 0 and a_low_10 < 1024
		ensure
			instance_free: class
			more_than_16bits: Result >= Hex_10000

	surrogate_from_bytes (a_high_most, a_high_least, a_low_most, a_low_least: INTEGER_32): INTEGER_32
			-- Supplementary code point from bytes
		require
			surrogate_high: is_high_surrogate (a_high_most)
			high_least_byte: is_byte (a_high_least)
			surrogate_low: is_low_surrogate (a_low_most)
			low_least_byte: is_byte (a_low_least)
		ensure
			instance_free: class
			more_than_16bits: Result >= Hex_10000

invariant
		-- from ANY
	reflexive_equality: standard_is_equal (Current)
	reflexive_conformance: conforms_to (Current)

end -- class UC_UTF16_ROUTINES
Generated by ISE EiffelStudio