restemplate/indy/Protocols/IdCoderHeader.pas

{
  $Project$
  $Workfile$
  $Revision$
  $DateUTC$
  $Id$

  This file is part of the Indy (Internet Direct) project, and is offered
  under the dual-licensing agreement described on the Indy website.
  (http://www.indyproject.org/)

  Copyright:
   (c) 1993-2005, Chad Z. Hower and the Indy Pit Crew. All rights reserved.


  $Log$


    Rev 1.13    9/8/2004 8:55:46 PM  JPMugaas
  Fix for compile problem where a char is being compared with an incompatible
  type in some compilers.


    Rev 1.12    02/07/2004 21:59:28  CCostelloe
  Bug fix


    Rev 1.11    17/06/2004 14:19:00  CCostelloe
  Bug fix for long subject lines that have characters needing CharSet encoding


    Rev 1.10    23/04/2004 20:33:04  CCostelloe
  Minor change to support From headers holding multiple addresses


    Rev 1.9    2004.02.03 5:44:58 PM  czhower
  Name changes


   Rev 1.8    24/01/2004 19:08:14  CCostelloe
 Cleaned up warnings


    Rev 1.7    1/22/2004 3:56:38 PM  SPerry
  fixed set problems


   Rev 1.6    2004.01.22 2:34:58 PM  czhower
  TextIsSame + D8 bug workaround


    Rev 1.5    10/16/2003 11:11:02 PM  DSiders
  Added localization comments.


    Rev 1.4    10/8/2003 9:49:36 PM  GGrieve
  Use IdDelete


    Rev 1.3    6/10/2003 5:48:46 PM  SGrobety
  DotNet updates


    Rev 1.2    04/09/2003 20:35:28  CCostelloe
  Parameter AUseAddressForNameIfNameMissing (defaulting to False to preserve
  existing code) added to EncodeAddressItem


    Rev 1.1    2003.06.23 9:46:52 AM  czhower
  Russian, Ukranian support for headers.


   Rev 1.0    11/14/2002 02:14:46 PM  JPMugaas
}
unit IdCoderHeader;

//refer http://www.faqs.org/rfcs/rfc2047.html

//TODO: Optimize and restructure code
//TODO: Redo this unit to fit with the new coders and use the exisiting MIME stuff

{
2002-08-21 JM Berg
 - brought in line with the RFC regarding
   whitespace between encoded words
 - added logic so that lines that already seem encoded are really encoded again
   (so that if a user types =?iso8859-1?Q?======?= its really encoded again
   and displayed like that on the other side)
2001-Nov-18 Peter Mee
 - Fixed multiple QP decoding in single header.
11-10-2001 - J. Peter Mugaas
  - tiny fix for 8bit header encoding suggested by Andrew P.Rybin
}

interface

{$i IdCompilerDefines.inc}

uses
  Classes,
  IdComponent,
  IdEMailAddress,
  IdHeaderCoderBase;

// Procs
  function EncodeAddressItem(EmailAddr: TIdEmailAddressItem; const HeaderEncoding: Char;
    const MimeCharSet: string; AUseAddressForNameIfNameMissing: Boolean = False): string;
  function EncodeHeader(const Header: string; Specials: String; const HeaderEncoding: Char;
   const MimeCharSet: string): string;
  function EncodeAddress(EmailAddr: TIdEMailAddressList; const HeaderEncoding: Char;
    const MimeCharSet: string; AUseAddressForNameIfNameMissing: Boolean = False): string;
  function DecodeHeader(const Header: string): string;
  procedure DecodeAddress(EMailAddr: TIdEmailAddressItem);
  procedure DecodeAddresses(AEMails: String; EMailAddr: TIdEmailAddressList);

implementation

uses
  IdException,
  IdGlobal,
  IdGlobalProtocols,
  IdAllHeaderCoders,
  SysUtils;

const
  csAddressSpecials: String = '()[]<>:;.,@\"';  {Do not Localize}

  base64_tbl: array [0..63] of Char = (
    'A','B','C','D','E','F','G','H',     {Do not Localize}
    'I','J','K','L','M','N','O','P',      {Do not Localize}
    'Q','R','S','T','U','V','W','X',      {Do not Localize}
    'Y','Z','a','b','c','d','e','f',      {Do not Localize}
    'g','h','i','j','k','l','m','n',      {Do not Localize}
    'o','p','q','r','s','t','u','v',       {Do not Localize}
    'w','x','y','z','0','1','2','3',       {Do not Localize}
    '4','5','6','7','8','9','+','/');      {Do not Localize}

function EncodeAddressItem(EmailAddr: TIdEmailAddressItem; const HeaderEncoding: Char;
  const MimeCharSet: string; AUseAddressForNameIfNameMissing: Boolean = False): string;
var
  S : string;
  I : Integer;
  NeedEncode : Boolean;
begin
  if AUseAddressForNameIfNameMissing and (EmailAddr.Name = '') then begin
    {CC: Use Address as Name...}
    EmailAddr.Name := EmailAddr.Address;
  end;
  if EmailAddr.Name <> '' then  {Do not Localize}
  begin
    NeedEncode := False;
    for I := 1 to Length(EmailAddr.Name) do begin
      if (EmailAddr.Name[I] < #32) or (EmailAddr.Name[I] >= #127) then
      begin
        NeedEncode := True;
        Break;
      end;
    end;
    if NeedEncode then begin
      S := EncodeHeader(EmailAddr.Name, csAddressSpecials, HeaderEncoding, MimeCharSet);
    end else begin
      { quoted string }
      S := '"';           {Do not Localize}
      for I := 1 to Length(EmailAddr.Name) do
      begin              { quote special characters }
        if (EmailAddr.Name[I] = '\') or (EmailAddr.Name[I] = '"') then begin
          S := S + '\';    {Do not Localize}
        end;
        S := S + EmailAddr.Name[I];
      end;
      S := S + '"';   {Do not Localize}
    end;
    Result := IndyFormat('%s <%s>', [S, EmailAddr.Address])    {Do not Localize}
  end
  else begin
    Result := IndyFormat('%s', [EmailAddr.Address]);     {Do not Localize}
  end;
end;

function B64(AChar: Char): Byte;
//TODO: Make this use the more efficient MIME Coder
begin
  for Result := Low(base64_tbl) to High(base64_tbl) do begin
    if AChar = base64_tbl[Result] then begin
      Exit;
    end;
  end;
  Result := 0;
end;

function DecodeHeader(const Header: string): string;
var
  HeaderCharSet, HeaderEncoding, HeaderData, S: string;
  LDecoded: Boolean;
  LStartPos, LLength, LEncodingStartPos, LEncodingEndPos, LLastStartPos: Integer;
  LLastWordWasEncoded: Boolean;
  Buf: TIdBytes;

  function ExtractEncoding(const AHeader: string; const AStartPos: Integer;
    var VStartPos, VEndPos: Integer; var VCharSet, VEncoding, VData: String): Boolean;
  var
    LCharSet, LEncoding, LData, LDataEnd: Integer;
  begin
    Result := False;

    //we need a '=? followed by 2 question marks followed by a '?='.    {Do not Localize}
    //to find the end of the substring, we can't just search for '?=',    {Do not Localize}
    //example: '=?ISO-8859-1?Q?=E4?='    {Do not Localize}

    LCharSet := PosIdx('=?', AHeader, AStartPos);  {Do not Localize}
    if (LCharSet = 0) or (LCharSet > VEndPos) then begin
      Exit;
    end;
    Inc(LCharSet, 2);

    LEncoding := PosIdx('?', AHeader, LCharSet);  {Do not Localize}
    if (LEncoding = 0) or (LEncoding > VEndPos) then begin
      Exit;
    end;
    Inc(LEncoding);

    LData := PosIdx('?', AHeader, LEncoding);  {Do not Localize}
    if (LData = 0) or (LData > VEndPos) then begin
      Exit;
    end;
    Inc(LData);

    LDataEnd := PosIdx('?=', AHeader, LData);  {Do not Localize}
    if (LDataEnd = 0) or (LDataEnd > VEndPos) then begin
      Exit;
    end;
    Inc(LDataEnd);

    VStartPos := LCharSet-2;
    VEndPos := LDataEnd;
    VCharSet := Copy(AHeader, LCharSet, LEncoding-LCharSet-1);
    VEncoding := Copy(AHeader, LEncoding, LData-LEncoding-1);
    VData := Copy(AHeader, LData, LDataEnd-LData-1);

    Result := True;
  end;

  // TODO: use TIdCoderQuotedPrintable and TIdCoderMIME instead
  function ExtractEncodedData(const AEncoding, AData: String; var VDecoded: TIdBytes): Boolean;
  var
    I, J: Integer;
    a3: TIdBytes;
    a4: array [0..3] of Byte;
  begin
    Result := False;
    SetLength(VDecoded, 0);
    case PosInStrArray(AEncoding, ['Q', 'B', '8'], False) of {Do not Localize}
      0: begin // quoted-printable
        I := 1;
        while I <= Length(AData) do begin
          if AData[i] = '_' then begin {Do not Localize}
            AppendByte(VDecoded, Ord(' '));    {Do not Localize}
          end
          else if (AData[i] = '=') and (Length(AData) >= (i+2)) then begin //make sure we can access i+2
            AppendByte(VDecoded, IndyStrToInt('$' + Copy(AData, i+1, 2), 32));   {Do not Localize}
            Inc(I, 2);
          end else
          begin
            AppendByte(VDecoded, Ord(AData[i]));
          end;
          Inc(I);
        end;
        Result := True;
      end;
      1: begin // base64
        J := Length(AData) div 4;
        if J > 0 then
        begin
          SetLength(a3, 3);
          for I := 0 to J-1 do
          begin
            a4[0] := B64(AData[(I*4)+1]);
            a4[1] := B64(AData[(I*4)+2]);
            a4[2] := B64(AData[(I*4)+3]);
            a4[3] := B64(AData[(I*4)+4]);

            a3[0] := Byte((a4[0] shl 2) or (a4[1] shr 4));
            a3[1] := Byte((a4[1] shl 4) or (a4[2] shr 2));
            a3[2] := Byte((a4[2] shl 6) or (a4[3] shr 0));

            if AData[(I*4)+4] = '=' then begin
              if AData[(I*4)+3] = '=' then begin
                AppendByte(VDecoded, a3[0]);
              end else begin
                AppendBytes(VDecoded, a3, 0, 2);
              end;
              Break;
            end else begin
              AppendBytes(VDecoded, a3, 0, 3);
            end;
          end;
        end;
        Result := True;
      end;
      2: begin // 8-bit
        {$IFDEF STRING_IS_ANSI}
        if AData <> '' then begin
          VDecoded := RawToBytes(AData[1], Length(AData));
        end;
        {$ELSE}
        VDecoded := IndyTextEncoding_8Bit.GetBytes(AData);
        {$ENDIF}
        Result := True;
      end;
    end;
  end;

begin
  Result := Header;

  LStartPos := 1;
  LLength := Length(Result);

  LLastWordWasEncoded := False;
  LLastStartPos := LStartPos;

  while LStartPos <= LLength do
  begin
    // valid encoded words can not contain spaces
    // if the user types something *almost* like an encoded word,
    // and its sent as-is, we need to find this!!
    LStartPos := FindFirstNotOf(LWS, Result, LLength, LStartPos);
    if LStartPos = 0 then begin
      Break;
    end;
    LEncodingEndPos := FindFirstOf(LWS, Result, LLength, LStartPos);
    if LEncodingEndPos <> 0 then begin
      Dec(LEncodingEndPos);
    end else begin
      LEncodingEndPos := LLength;
    end;
    if ExtractEncoding(Result, LStartPos, LEncodingStartPos, LEncodingEndPos, HeaderCharSet, HeaderEncoding, HeaderData) then
    begin
      LDecoded := False;
      if ExtractEncodedData(HeaderEncoding, HeaderData, Buf) then begin
        LDecoded := DecodeHeaderData(HeaderCharSet, Buf, S);
      end;
      if LDecoded then
      begin
        //replace old substring in header with decoded string,
        // ignoring whitespace that separates encoded words:
        if LLastWordWasEncoded then begin
          Result := Copy(Result, 1, LLastStartPos - 1) + S + Copy(Result, LEncodingEndPos + 1, MaxInt);
          LStartPos := LLastStartPos + Length(S);
        end else begin
          Result := Copy(Result, 1, LEncodingStartPos - 1) + S + Copy(Result, LEncodingEndPos + 1, MaxInt);
          LStartPos := LEncodingStartPos + Length(S);
        end;
      end else
      begin
        // could not decode the data, so preserve it in case the user
        // wants to do it manually.  Though, they really should use the
        // IdHeaderCoderBase.GHeaderDecodingNeeded hook for that instead...
        LStartPos := LEncodingEndPos + 1;
      end;
      LLength := Length(Result);
      LLastWordWasEncoded := True;
      LLastStartPos := LStartPos;
    end else
    begin
      LStartPos := FindFirstOf(LWS, Result, LLength, LStartPos);
      if LStartPos = 0 then begin
        Break;
      end;
      LLastWordWasEncoded := False;
    end;
  end;
end;

procedure DecodeAddress(EMailAddr : TIdEmailAddressItem);
begin
  EMailAddr.Name := UnquotedStr(DecodeHeader(EMailAddr.Name));
end;

procedure DecodeAddresses(AEMails : String; EMailAddr: TIdEmailAddressList);
var
  idx : Integer;
begin
  EMailAddr.EMailAddresses := AEMails;
  for idx := 0 to EMailAddr.Count-1 do begin
    DecodeAddress(EMailAddr[idx]);
  end;
end;

function EncodeAddress(EmailAddr: TIdEMailAddressList; const HeaderEncoding: Char;
  const MimeCharSet: string; AUseAddressForNameIfNameMissing: Boolean = False): string;
var
  idx : Integer;
begin
  if EmailAddr.Count > 0 then begin
    Result := EncodeAddressItem(EMailAddr[0], HeaderEncoding, MimeCharSet, AUseAddressForNameIfNameMissing);
    for idx := 1 to EmailAddr.Count-1 do begin
      Result := Result + ', ' +    {Do not Localize}
        EncodeAddressItem(EMailAddr[idx], HeaderEncoding, MimeCharSet, AUseAddressForNameIfNameMissing);
    end;
  end else begin
    Result := '';      {Do not Localize}
  end;
end;

{ encode a header field if non-ASCII characters are used }
function EncodeHeader(const Header: string; Specials: String; const HeaderEncoding: Char;
  const MimeCharSet: string): string;
const
  SPACES = [Ord(' '), 9, 13, 10];    {Do not Localize}
var
  T: string;
  Buf: TIdBytes;
  L, P, Q, R: Integer;
  B0, B1, B2: Integer;
  InEncode: Integer;
  NeedEncode: Boolean;
  csNoEncode, csNoReqQuote, csSpecials: TIdBytes;
  BeginEncode, EndEncode: string;

  procedure EncodeWord(AP: Integer);
  const
    MaxEncLen = 75;
  var
    LQ: Integer;
    EncLen: Integer;
    Enc1: string;
  begin
    T := T + BeginEncode;
    if L < AP then AP := L + 1;
    LQ := InEncode;
    InEncode := -1;
    EncLen := Length(BeginEncode) + 2;

    case PosInStrArray(HeaderEncoding, ['Q', 'B'], False) of {Do not Localize}
      0: begin { quoted-printable }
        while LQ < AP do
        begin
          if Buf[LQ] = Ord(' ') then begin {Do not Localize}
            Enc1 := '_';  {Do not Localize}
          end
          else if (not ByteIsInSet(Buf, LQ, csNoReqQuote)) or ByteIsInSet(Buf, LQ, csSpecials) then begin
            Enc1 := '=' + IntToHex(Buf[LQ], 2);     {Do not Localize}
          end
          else begin
            Enc1 := Char(Buf[LQ]);
          end;
          if (EncLen + Length(Enc1)) > MaxEncLen then begin
            //T := T + EndEncode + #13#10#9 + BeginEncode;
            //CC: The #13#10#9 above caused the subsequent call to FoldWrapText to
            //insert an extra #13#10 which, being a blank line in the headers,
            //was interpreted by email clients, etc., as the end of the headers
            //and the start of the message body.  FoldWrapText seems to look for
            //and treat correctly the sequence #13#10 + ' ' however...
            T := T + EndEncode + EOL + ' ' + BeginEncode;
            EncLen := Length(BeginEncode) + 2;
          end;
          T := T + Enc1;
          Inc(EncLen, Length(Enc1));
          Inc(LQ);
        end;
      end;
      1: begin { base64 }
        while LQ < AP do begin
          if (EncLen + 4) > MaxEncLen then begin
            //T := T + EndEncode + #13#10#9 + BeginEncode;
            //CC: The #13#10#9 above caused the subsequent call to FoldWrapText to
            //insert an extra #13#10 which, being a blank line in the headers,
            //was interpreted by email clients, etc., as the end of the headers
            //and the start of the message body.  FoldWrapText seems to look for
            //and treat correctly the sequence #13#10 + ' ' however...
            T := T + EndEncode + EOL + ' ' + BeginEncode;
            EncLen := Length(BeginEncode) + 2;
          end;

          B0 := Buf[LQ];
          case AP - LQ of
            1:
              begin
                T := T + base64_tbl[B0 shr 2] + base64_tbl[B0 and $03 shl 4] + '==';  {Do not Localize}
              end;
            2:
              begin
                B1 := Buf[LQ + 1];
                T := T + base64_tbl[B0 shr 2] +
                  base64_tbl[B0 and $03 shl 4 + B1 shr 4] +
                  base64_tbl[B1 and $0F shl 2] + '=';  {Do not Localize}
              end;
            else
              begin
                B1 := Buf[LQ + 1];
                B2 := Buf[LQ + 2];
                T := T + base64_tbl[B0 shr 2] +
                  base64_tbl[B0 and $03 shl 4 + B1 shr 4] +
                  base64_tbl[B1 and $0F shl 2 + B2 shr 6] +
                  base64_tbl[B2 and $3F];
              end;
          end;
          Inc(EncLen, 4);
          Inc(LQ, 3);
        end;
      end;
    end;
    T := T + EndEncode;
  end;

  function CreateEncodeRange(AStart, AEnd: Byte): TIdBytes;
  var
    I: Integer;
  begin
    SetLength(Result, AEnd-AStart+1);
    for I := 0 to Length(Result)-1 do begin
      Result[I] := AStart+I;
    end;
  end;

begin
  if Header = '' then begin
    Result := '';
    Exit;
  end;

  // TODO: this function needs to take encoded codeunits into account when
  // deciding where to split the encoded data between adjacent encoded-words,
  // so that a single encoded character does not get split between encoded-words
  // thus corrupting that character...

  Buf := EncodeHeaderData(MimeCharSet, Header);

  {Suggested by Andrew P.Rybin for easy 8bit support}
  if HeaderEncoding = '8' then begin {Do not Localize}
    Result := BytesToStringRaw(Buf);
    Exit;
  end;//if

  // RLebeau 1/7/09: using Char() for #128-#255 because in D2009, the compiler
  // may change characters >= #128 from their Ansi codepage value to their true
  // Unicode codepoint value, depending on the codepage used for the source code.
  // For instance, #128 may become #$20AC...

  // RLebeau 2/12/09: changed the logic to use "no-encode" sets instead, so
  // that words containing codeunits outside the ASCII range are always
  // encoded.  This is easier to manage when Unicode data is involved.

  csNoEncode := CreateEncodeRange(32, 126);

  csNoReqQuote := CreateEncodeRange(33, 60);
  AppendByte(csNoReqQuote, 62);
  AppendBytes(csNoReqQuote, CreateEncodeRange(64, 94));
  AppendBytes(csNoReqQuote, CreateEncodeRange(96, 126));

  csSpecials := ToBytes(Specials, IndyTextEncoding_8Bit);

  BeginEncode := '=?' + MimeCharSet + '?' + HeaderEncoding + '?';    {Do not Localize}
  EndEncode := '?=';  {Do not Localize}

  // JMBERG: We want to encode stuff that the user typed
  // as if it already is encoded!!
  if DecodeHeader(Header) <> Header then begin
    RemoveBytes(csNoEncode, 1, ByteIndex(Ord('='), csNoEncode));
  end;

  L := Length(Buf);
  P := 0;
  T := '';  {Do not Localize}
  InEncode := -1;
  while P < L do
  begin
    Q := P;
    while (P < L) and (Buf[P] in SPACES) do begin
      Inc(P);
    end;
    R := P;
    NeedEncode := False;
    while (P < L) and (not (Buf[P] in SPACES)) do begin
      if (not ByteIsInSet(Buf, P, csNoEncode)) or ByteIsInSet(Buf, P, csSpecials) then begin
        NeedEncode := True;
      end;
      Inc(P);
    end;
    if NeedEncode then begin
      if InEncode = -1 then begin
        T := T + BytesToString(Buf, Q, R - Q);
        InEncode := R;
      end;
    end else
    begin
      if InEncode <> -1 then begin
        EncodeWord(Q);
      end;
      T := T + BytesToString(Buf, Q, P - Q);
    end;
  end;
  if InEncode <> -1 then begin
    EncodeWord(P);
  end;
  Result := T;
end;

end.