Ignore:
Timestamp:
Feb 25, 2019, 8:34:42 PM (6 years ago)
Author:
ataylor
Message:

Experimental new logic to try and fix DBCS text wrapping.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/Components/RichTextDocumentUnit.pas

    r405 r418  
    3636  end;
    3737
    38   TTextElementType = ( teText, // a character
     38  TTextElementType = ( teText,              // a character
    3939                       teWordBreak,
    40                        teLineBreak, // end of para
     40                       teLineBreak,         // end of para
    4141                       teTextEnd,
    4242                       teImage,
    43                        teStyle );
     43                       teStyle,
     44                       teWrapChar,          // A non-whitespace wrappable character (ALT)
     45                       teLeadByte,          // DBCS lead byte (ALT)
     46                       teSecondByte );      // DBCS secondary byte (ALT)
    4447
    4548  TTextElement = record
     
    113116                                Buffer: PChar;
    114117                                BufferLength: longint ): longint;
     118
     119
     120// ALT begins
     121//
     122
     123// Check for special text element types and adjust value accordingly.
     124procedure CheckSpecialElementType( const Character:   Char;
     125                                   var   ElementType: TTextElementType;
     126                                   var   InsideDBC:   Boolean;
     127                                   const Codepage:    LongInt );
     128
     129// Returns true if the given byte value is a legally-wrappable single-byte
     130// character under the given Asian codepage.
     131function IsAsianWrapChar( const CharByte: Byte;
     132                          const Codepage: LongInt ): boolean;
     133
     134// Returns true if the given byte value is the leading byte of a multi-byte
     135// character under the given Asian codepage.
     136function IsDBCSLeadByte( const CharByte: Byte;
     137                         const Codepage: LongInt ): boolean;
     138
     139// Returns true if the given byte value is valid as a possible second byte of
     140// a multi-byte character (this does not guarantee that it IS one, just that
     141// it COULD be).
     142function IsDBCSSecondByte( const CharByte: Byte;
     143                           const Codepage: LongInt ): boolean;
     144
     145// Adjusts the character position to the beginning of any multi-byte character.
     146procedure MoveToCharacterBoundary(     TextPointer: PChar;
     147                                   var Index:       LongInt;
     148                                   var Offset:      LongInt;
     149                                       RowStart:    LongInt;
     150                                       Codepage:    LongInt );
     151//
     152// ALT ends
     153
    115154
    116155Implementation
     
    349388var
    350389  TheChar: Char;
    351   NextChar: char;
     390  NextChar: Char;
    352391begin
    353392  with Result do
     
    356395    Character := TheChar;
    357396    inc( TextPointer );
     397
    358398
    359399    case TheChar of
     
    404444      end;
    405445
     446//    '-': // ---- Hyphen (ALT)
     447//      ElementType := teWrapChar;
     448
    406449      else
    407450        ElementType := teText;
    408451    end;
     452
    409453  end; // with
     454
    410455  NextElement := TextPointer;
    411456end;
     
    485530        end;
    486531      end
     532
     533//    '-': // ---- Hyphen (ALT)
     534//      ElementType := teWrapChar;
     535
    487536      else
    488537        ElementType := teText;
     
    802851end;
    803852
     853// ALT begins
     854//
     855// Check for special text element types that depend on context.
     856//
     857procedure CheckSpecialElementType( const Character:   Char;
     858                                   var   ElementType: TTextElementType;
     859                                   var   InsideDBC:   Boolean;
     860                                   const Codepage:    LongInt );
     861var
     862  CharByte: Byte;
     863begin
     864  if Codepage in [ 874, 932, 936, 942, 943, 949, 950, 1381, 1386 ] then
     865  begin
     866    CharByte := ord( Character );
     867    if InsideDBC then
     868    begin
     869        InsideDBC := false;
     870        // sanity check for corrupt text sequence (definitely not foolproof)
     871        if IsDBCSSecondByte( CharByte, Codepage ) then
     872          ElementType := teSecondByte
     873        else
     874          ElementType := teText;
     875    end
     876    else
     877    begin
     878      if IsAsianWrapChar( CharByte, Codepage ) then
     879      begin
     880        ElementType := teWrapChar;
     881        InsideDBC := false;
     882      end
     883      else if IsDBCSLeadByte( CharByte, Codepage ) then
     884      begin
     885        ElementType := teLeadByte;
     886        InsideDBC := true;
     887      end;
     888    end;
     889  end;
     890end;
     891
     892// Check if this (single-byte) character is a legal wrap point under certain
     893// Asian codepages. This is really only used for Thai and for Japanese
     894// half-width katakana; other DBCS languages use double-byte characters for all
     895// their native glyphs.
     896//
     897function IsAsianWrapChar( const CharByte: Byte;
     898                          const Codepage: LongInt ): boolean;
     899begin
     900    Result := false;
     901
     902    if ( CharByte < $80) then
     903      exit;
     904
     905    case Codepage of
     906      932, 942, 943:        // Japanese
     907        if CharByte in [ $A2, $A6, $B1..$DD ] then
     908          Result := true;
     909      874:                  // Thai
     910        Result := true;
     911    end;
     912end;
     913
     914// Check if this is the lead byte of a double-byte character. This is essential
     915// to know in certain cases:
     916//  - Nothing must ever be inserted between such a byte and the next byte
     917//    (e.g. line break, tag, etc).
     918//  - Cursor position must never be set between such a byte and the next byte.
     919//  - Selection state must never change between such a byte and the next byte.
     920//
     921function IsDBCSLeadByte( const CharByte: Byte;
     922                         const Codepage: LongInt ): boolean;
     923begin
     924    Result := false;
     925
     926    case Codepage of
     927      932, 942, 943:        // Japanese
     928        if CharByte in [ $81..$9F, $E0..$FC ] then
     929          Result := true;
     930      949:                  // Korean KSC
     931        if CharByte in [ $85..$FE ] then
     932          Result := true;
     933      1381:                 // Chinese GB2312
     934        if CharByte in [ $8C..$FE ] then
     935          Result := true;
     936      936, 950, 1386:       // Chinese BIG-5 or GBK
     937        if CharByte in [ $81..$FE ] then
     938          Result := true;
     939    end;
     940end;
     941
     942// Check to see if this byte is a valid second byte in a double-byte character.
     943// (This doesn't guarantee that it IS such a byte, only that it COULD be. The
     944// caller is assumed to know whether we're in a double byte character or not.)
     945//
     946function IsDBCSSecondByte( const CharByte: Byte;
     947                           const Codepage: LongInt ): boolean;
     948begin
     949    Result := false;
     950
     951    case Codepage of
     952      932, 936, 942, 943, 949, 950, 1386:
     953        if CharByte >= $40 then
     954          Result := true;
     955      1381:
     956        if CharByte >= $A1 then
     957          Result := true;
     958    end;
     959end;
     960
     961// Given a string position, check to see if it's in the middle of a double-byte
     962// character; if so, move back by one position so that we're sitting immediately
     963// in front of the double-byte character instead.
     964//
     965procedure MoveToCharacterBoundary(     TextPointer: PChar;
     966                                   var Index:       LongInt;
     967                                   var Offset:      LongInt;
     968                                       RowStart:    LongInt;
     969                                       Codepage:    LongInt  );
     970var
     971  P:         PChar;
     972  NextP:     PChar;
     973  Element:   TTextElement;
     974  InsideDBC: boolean;
     975begin
     976  if ( Offset > 0 ) and
     977     ( Codepage in [ 932, 936, 942, 943, 949, 950, 1381, 1386 ]) then
     978  begin
     979    P := TextPointer + RowStart;
     980    InsideDBC := false;
     981
     982    // Because parsing of byte types is state based, we must verify every
     983    // byte's type from the beginning of the line until we reach the target.
     984    while RowStart < Index do
     985    begin
     986      Element := ExtractNextTextElement( P, NextP );
     987      CheckSpecialElementType( Element.Character, Element.ElementType, InsideDBC, Codepage );
     988      P := NextP;
     989      inc( RowStart );
     990    end;
     991{
     992    // We've reached the target position, and the current parsing state should
     993    // be correctly set. So now we can safely determine the target byte's type.
     994    Element := ExtractNextTextElement( P, NextP );
     995    CheckSpecialElementType( Element.Character, Element.ElementType, InsideDBC, Codepage );
     996}
     997    if InsideDBC then
     998    begin
     999      // If this the first byte of a double byte character, move back by one.
     1000      dec( Index );
     1001      dec( Offset );
     1002    end;
     1003  end;
     1004
     1005end;
     1006//
     1007// ALT ends
     1008
    8041009Initialization
    8051010End.
Note: See TracChangeset for help on using the changeset viewer.