Einzelnen Beitrag anzeigen

LTE5

Registriert seit: 13. Nov 2017
355 Beiträge
 
Delphi 10.2 Tokyo Starter
 
#36

AW: Mal wieder Kodierungsprobleme. ANSI UTF8 UTF16

  Alt 18. Nov 2017, 21:28
Es gibt noch das hier, aber wie man das anwendet, ist mir ein Rätsel.
http://chsdet.sourceforge.net/

Sonst habe ich noch das hier gefunden, bin aber gerade unfähig, eine ANSI-Datei zu erstellen. Daher bekomme ich bei jeder Datei True zurück.
Delphi-Quellcode:
// Heuristically decides whether the file FileName could be UTF-8 encoded.
//
// The whole file is loaded into memory and every pair of adjacent bytes is
// inspected:
//   * lead byte (11xxxxxx) followed by a continuation byte (10xxxxxx)
//     counts as evidence FOR UTF-8 (YesSequences);
//   * a continuation byte that follows a plain ASCII byte, or a lead byte
//     that is NOT followed by a continuation byte, counts as evidence
//     AGAINST UTF-8 (NoSequences).
//
// Returns True when the evidence for UTF-8 is at least as strong as the
// evidence against it (so pure ASCII files are reported as UTF-8), and
// False when the file does not exist.
// Exceptions raised by TMemoryStream.LoadFromFile are passed on to the caller.
function FileMayBeUTF8(FileName: WideString): Boolean;
var
 Stream: TMemoryStream;
 Cursor: PByte;
 Remaining: Int64;
 PreviousByte, CurrentByte: Byte;
 YesSequences, NoSequences: Integer;
begin
 // Give the function a defined result on the early exit below.
 Result := False;
 if not FileExists(FileName) then
  Exit;

 YesSequences := 0;
 NoSequences := 0;
 Stream := TMemoryStream.Create;
 try
  Stream.LoadFromFile(FileName);

  if Stream.Size > 0 then
   begin
    // Walk the loaded memory directly; this avoids losing the byte pair
    // that would straddle two fixed-size read buffers.
    Cursor := Stream.Memory;
    PreviousByte := Cursor^;
    Inc(Cursor);
    Remaining := Stream.Size - 1;

    while Remaining > 0 do
     begin
      CurrentByte := Cursor^;
      if (CurrentByte and $C0) = $80 then
       begin
        // Continuation byte: fine after a lead byte, wrong after ASCII.
        if (PreviousByte and $C0) = $C0 then
         Inc(YesSequences)
        else if (PreviousByte and $80) = $00 then
         Inc(NoSequences);
       end
      else if (PreviousByte and $C0) = $C0 then
       begin
        // Lead byte without a continuation byte, e.g. an ANSI umlaut
        // followed by a letter.
        Inc(NoSequences);
       end;
      PreviousByte := CurrentByte;
      Inc(Cursor);
      Dec(Remaining);
     end;

    // A lead byte at the very end of the file has no continuation either.
    if (PreviousByte and $C0) = $C0 then
     Inc(NoSequences);
   end;

  // >= makes ASCII-only files count as UTF-8, which is harmless.
  // A plain > would accept only files that contain multi-byte sequences.
  Result := (YesSequences >= NoSequences);
 finally
  Stream.Free;
 end;
end;
Hier noch eine schöne Version. Sie gibt aber leider bei ANSI auch True zurück.
Delphi-Quellcode:
// Maps the first byte of a UTF-8 sequence to the total number of bytes the
// sequence occupies (1..4). Returns -1 when c cannot start a sequence.
function UTF8CharLength(const c: Byte): Integer;
begin
 case c of
  $00..$7F: Result := 1;  // 0xxxxxxx - single byte
  $C0..$DF: Result := 2;  // 110yyyyy - two bytes
  $E0..$EF: Result := 3;  // 1110zzzz - three bytes
  $F0..$F7: Result := 4;  // 11110uuu - four bytes
 else
  Result := -1;           // 10xxxxxx or 11111xxx - not a lead byte
 end;
end;

// Reports whether c is a UTF-8 continuation byte, i.e. has the bit pattern
// 10xxxxxx, which is exactly the range $80..$BF.
function UTF8IsTrailChar(const c: Byte): Boolean;
begin
 Result := (c >= $80) and (c <= $BF);
end;

// Checks whether the ASize bytes starting at AMem consist only of
// structurally well-formed UTF-8 sequences: every lead byte must be
// recognised by UTF8CharLength, the sequence must fit completely into the
// buffer, and every following byte of the sequence must satisfy
// UTF8IsTrailChar. Returns True for an empty buffer (ASize <= 0).
function IsUTF8Memory(AMem: PBYTE; ASize: Int64): Boolean;
var
 Offset: Int64;
 SeqLen: Integer;
 TrailIndex: Integer;
begin
 Offset := 0;
 while Offset < ASize do
  begin
   // length of the sequence announced by the current lead byte
   SeqLen := UTF8CharLength(AMem^);

   // reject an invalid lead byte or a sequence running past the buffer end
   if (SeqLen < 1) or (SeqLen > 4) or (Offset + SeqLen > ASize) then
    begin
     Result := False;
     Exit;
    end;

   // step over the lead byte, then verify each continuation byte in turn
   Inc(AMem);
   for TrailIndex := 2 to SeqLen do
    begin
     if not UTF8IsTrailChar(AMem^) then
      begin
       Result := False;
       Exit;
      end;
     Inc(AMem);
    end;

   Inc(Offset, SeqLen);
  end;
 Result := True;
end;

Geändert von LTE5 (18. Nov 2017 um 21:46 Uhr)
  Mit Zitat antworten Zitat