Context Navigation

← Previous Change
Next Change →

RichTextDocumentUnit.pas

Timestamp:

Feb 25, 2019, 8:34:42 PM (6 years ago)

Author:

ataylor

Message:

Experimental new logic to try and fix DBCS text wrapping.

File:

: 1 edited

trunk/Components/RichTextDocumentUnit.pas (modified) (7 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/Components/RichTextDocumentUnit.pas

-              r405
+              r418
   end;
-  TTextElementType = ( teText, // a character
+  TTextElementType = ( teText,              // a character
                        teWordBreak,
-                       teLineBreak, // end of para
+                       teLineBreak,         // end of para
                        teTextEnd,
                        teImage,
-                       teStyle );
+                       teStyle,
+                       teWrapChar,          // A non-whitespace wrappable character (ALT)
+                       teLeadByte,          // DBCS lead byte (ALT)
+                       teSecondByte );      // DBCS secondary byte (ALT)
   TTextElement = record
 …
                                 Buffer: PChar;
                                 BufferLength: longint ): longint;
+// ALT begins
+//
+// Check for special text element types and adjust value accordingly.
+procedure CheckSpecialElementType( const Character:   Char;
+                                   var   ElementType: TTextElementType;
+                                   var   InsideDBC:   Boolean;
+                                   const Codepage:    LongInt );
+// Returns true if the given byte value is a legally-wrappable single-byte
+// character under the given Asian codepage.
+function IsAsianWrapChar( const CharByte: Byte;
+                          const Codepage: LongInt ): boolean;
+// Returns true if the given byte value is the leading byte of a multi-byte
+// character under the given Asian codepage.
+function IsDBCSLeadByte( const CharByte: Byte;
+                         const Codepage: LongInt ): boolean;
+// Returns true if the given byte value is valid as a possible second byte of
+// a multi-byte character (this does not guarantee that it IS one, just that
+// it COULD be).
+function IsDBCSSecondByte( const CharByte: Byte;
+                           const Codepage: LongInt ): boolean;
+// Adjusts the character position to the beginning of any multi-byte character.
+procedure MoveToCharacterBoundary(     TextPointer: PChar;
+                                   var Index:       LongInt;
+                                   var Offset:      LongInt;
+                                       RowStart:    LongInt;
+                                       Codepage:    LongInt );
+//
+// ALT ends
 Implementation
 …
 var
   TheChar: Char;
-  NextChar: char;
+  NextChar: Char;
 begin
   with Result do
 …
     Character := TheChar;
     inc( TextPointer );
     case TheChar of
 …
       end;
+//    '-': // ---- Hyphen (ALT)
+//      ElementType := teWrapChar;
       else
         ElementType := teText;
     end;
   end; // with
   NextElement := TextPointer;
 end;
 …
         end;
       end
+//    '-': // ---- Hyphen (ALT)
+//      ElementType := teWrapChar;
       else
         ElementType := teText;
 …
 end;
+// ALT begins
+//
+// Check for special text element types that depend on context.
+//
+procedure CheckSpecialElementType( const Character:   Char;
+                                   var   ElementType: TTextElementType;
+                                   var   InsideDBC:   Boolean;
+                                   const Codepage:    LongInt );
+var
+  CharByte: Byte;
+begin
+  if Codepage in [ 874, 932, 936, 942, 943, 949, 950, 1381, 1386 ] then
+  begin
+    CharByte := ord( Character );
+    if InsideDBC then
+    begin
+        InsideDBC := false;
+        // sanity check for corrupt text sequence (definitely not foolproof)
+        if IsDBCSSecondByte( CharByte, Codepage ) then
+          ElementType := teSecondByte
+        else
+          ElementType := teText;
+    end
+    else
+    begin
+      if IsAsianWrapChar( CharByte, Codepage ) then
+      begin
+        ElementType := teWrapChar;
+        InsideDBC := false;
+      end
+      else if IsDBCSLeadByte( CharByte, Codepage ) then
+      begin
+        ElementType := teLeadByte;
+        InsideDBC := true;
+      end;
+    end;
+  end;
+end;
+// Check if this (single-byte) character is a legal wrap point under certain
+// Asian codepages. This is really only used for Thai and for Japanese
+// half-width katakana; other DBCS languages use double-byte characters for all
+// their native glyphs.
+//
+function IsAsianWrapChar( const CharByte: Byte;
+                          const Codepage: LongInt ): boolean;
+begin
+    Result := false;
+    if ( CharByte < $80) then
+      exit;
+    case Codepage of
+      932, 942, 943:        // Japanese
+        if CharByte in [ $A2, $A6, $B1..$DD ] then
+          Result := true;
+      874:                  // Thai
+        Result := true;
+    end;
+end;
+// Check if this is the lead byte of a double-byte character. This is essential
+// to know in certain cases:
+//  - Nothing must ever be inserted between such a byte and the next byte
+//    (e.g. line break, tag, etc).
+//  - Cursor position must never be set between such a byte and the next byte.
+//  - Selection state must never change between such a byte and the next byte.
+//
+function IsDBCSLeadByte( const CharByte: Byte;
+                         const Codepage: LongInt ): boolean;
+begin
+    Result := false;
+    case Codepage of
+      932, 942, 943:        // Japanese
+        if CharByte in [ $81..$9F, $E0..$FC ] then
+          Result := true;
+      949:                  // Korean KSC
+        if CharByte in [ $85..$FE ] then
+          Result := true;
+      1381:                 // Chinese GB2312
+        if CharByte in [ $8C..$FE ] then
+          Result := true;
+      936, 950, 1386:       // Chinese BIG-5 or GBK
+        if CharByte in [ $81..$FE ] then
+          Result := true;
+    end;
+end;
+// Check to see if this byte is a valid second byte in a double-byte character.
+// (This doesn't guarantee that it IS such a byte, only that it COULD be. The
+// caller is assumed to know whether we're in a double byte character or not.)
+//
+function IsDBCSSecondByte( const CharByte: Byte;
+                           const Codepage: LongInt ): boolean;
+begin
+    Result := false;
+    case Codepage of
+      932, 936, 942, 943, 949, 950, 1386:
+        if CharByte >= $40 then
+          Result := true;
+      1381:
+        if CharByte >= $A1 then
+          Result := true;
+    end;
+end;
+// Given a string position, check to see if it's in the middle of a double-byte
+// character; if so, move back by one position so that we're sitting immediately
+// in front of the double-byte character instead.
+//
+procedure MoveToCharacterBoundary(     TextPointer: PChar;
+                                   var Index:       LongInt;
+                                   var Offset:      LongInt;
+                                       RowStart:    LongInt;
+                                       Codepage:    LongInt  );
+var
+  P:         PChar;
+  NextP:     PChar;
+  Element:   TTextElement;
+  InsideDBC: boolean;
+begin
+  if ( Offset > 0 ) and
+     ( Codepage in [ 932, 936, 942, 943, 949, 950, 1381, 1386 ]) then
+  begin
+    P := TextPointer + RowStart;
+    InsideDBC := false;
+    // Because parsing of byte types is state based, we must verify every
+    // byte's type from the beginning of the line until we reach the target.
+    while RowStart < Index do
+    begin
+      Element := ExtractNextTextElement( P, NextP );
+      CheckSpecialElementType( Element.Character, Element.ElementType, InsideDBC, Codepage );
+      P := NextP;
+      inc( RowStart );
+    end;
+{
+    // We've reached the target position, and the current parsing state should
+    // be correctly set. So now we can safely determine the target byte's type.
+    Element := ExtractNextTextElement( P, NextP );
+    CheckSpecialElementType( Element.Character, Element.ElementType, InsideDBC, Codepage );
+}
+    if InsideDBC then
+    begin
+      // The byte just before the target was a lead byte, so the target is the
+      // second byte of a double-byte character: move back by one.
+      dec( Index );
+      dec( Offset );
+    end;
+  end;
+end;
+//
+// ALT ends
 Initialization
 End.

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 418 for trunk/Components/RichTextDocumentUnit.pas

Legend:

trunk/Components/RichTextDocumentUnit.pas

Download in other formats: