Thema: Delphi HTML Tags entfernen

Einzelnen Beitrag anzeigen

tr909

Registriert seit: 5. Nov 2004
193 Beiträge
 
Turbo Delphi für Win32
 
#3

Re: HTML Tags entfernen

  Alt 16. Jan 2008, 11:04
Hatte gerade das gleiche Problem und habe nu provisorisch erst mal folgendes gebastelt und zusammengesucht
Delphi-Quellcode:
function GiveSZ(HCode: string): Char;
var
  i : Integer;
begin
  Result := ' ';
  if (HCode = '"') or (HCode = '"') then Result := '"';
  if (HCode = '&') or (HCode = '&') then Result := '&';
  if (HCode = '<') or (HCode = '<') then Result := '<';
  if (HCode = '>') or (HCode = '>') then Result := '>';
  // ISO 160 bis ISO 255 Codes
  if (HCode = '') or (HCode = ' ') then Result := ' ';
  if (HCode = '&iexl;') or (HCode = '¡') then Result := '¡';
  if (HCode = '&cent;') or (HCode = '¢') then Result := '¢';
  if (HCode = '&pound;') or (HCode = '£') then Result := '£';
  if (HCode = '&curren;') or (HCode = '¤') then Result := '¤';
  if (HCode = '&yen;') or (HCode = '¥') then Result := '¥';
  if (HCode = '&brkbar;') or (HCode = '¦') then Result := '¦';
  if (HCode = '&sect;') or (HCode = '§') then Result := '§';
  if (HCode = '&uml;') or (HCode = '¨') then Result := '¨';
  if (HCode = '&copy;') or (HCode = '©') then Result := '©';
  if (HCode = '&ordf;') or (HCode = 'ª') then Result := 'ª';
  if (HCode = '&laquo;') or (HCode = '«') then Result := '«';
  if (HCode = '&not;') or (HCode = '¬') then Result := '¬';
  if (HCode = '&shy;') or (HCode = '*') then Result := '*';
  if (HCode = '&reg;') or (HCode = '®') then Result := '®';
  if (HCode = '&hibar;') or (HCode = '¯') then Result := '¯';
  if (HCode = '&deg;') or (HCode = '°') then Result := '°';
  if (HCode = '&plusmn;') or (HCode = '±') then Result := '±';
  if (HCode = '&sup2;') or (HCode = '²') then Result := '²';
  if (HCode = '&sup3;') or (HCode = '³') then Result := '³';
  if (HCode = '&acute;') or (HCode = '´') then Result := '´';
  if (HCode = '&micro;') or (HCode = 'µ') then Result := 'µ';
  if (HCode = '&para;') or (HCode = '') then Result := '';
  if (HCode = '&middot;') or (HCode = '·') then Result := '·';
  if (HCode = '&cedil;') or (HCode = '¸') then Result := '¸';
  if (HCode = '&sup1;') or (HCode = '¹') then Result := '¹';
  if (HCode = '&ordm;') or (HCode = 'º') then Result := 'º';
  if (HCode = '&raquo;') or (HCode = '»') then Result := '»';
  if (HCode = '&frac14;') or (HCode = '¼') then Result := '¼';
  if (HCode = '&frac12;') or (HCode = '½') then Result := '½';
  if (HCode = '&frac34;') or (HCode = '¾') then Result := '¾';
  if (HCode = '&iquest;') or (HCode = '¿') then Result := '¿';
  if (HCode = '&Agrave;') or (HCode = 'À') then Result := 'À';
  if (HCode = '&Aacute;') or (HCode = 'Á') then Result := 'Á';
  if (HCode = '&Acirc;') or (HCode = 'Â') then Result := 'Â';
  if (HCode = '&Atilde;') or (HCode = 'Ã') then Result := 'Ã';
  if (HCode = '&Auml;') or (HCode = 'Ä') then Result := 'Ä';
  if (HCode = '&Aring;') or (HCode = 'Å') then Result := 'Å';
  if (HCode = '&AEling;') or (HCode = 'Æ') then Result := 'Æ';
  if (HCode = '&Ccedil;') or (HCode = 'Ç') then Result := 'Ç';
  if (HCode = '&Egrave;') or (HCode = 'È') then Result := 'È';
  if (HCode = '&Eacute;') or (HCode = 'É') then Result := 'É';
  if (HCode = '&Ecirce;') or (HCode = 'Ê') then Result := 'Ê';
  if (HCode = '&Euml;') or (HCode = 'Ë') then Result := 'Ë';
  if (HCode = '&Igrave;') or (HCode = 'Ì') then Result := 'Ì';
  if (HCode = '&Iacute;') or (HCode = 'Í') then Result := 'Í';
  if (HCode = '&Icirce;') or (HCode = 'Î') then Result := 'Î';
  if (HCode = '&Iuml;') or (HCode = 'Ï') then Result := 'Ï';
  if (HCode = '&ETH;') or (HCode = 'Ð') then Result := 'Ð';
  if (HCode = '&Ntilde;') or (HCode = 'Ñ') then Result := 'Ñ';
  if (HCode = '&Ograve;') or (HCode = 'Ò') then Result := 'Ò';
  if (HCode = '&Oacute;') or (HCode = 'Ó') then Result := 'Ó';
  if (HCode = '&Ocirc;') or (HCode = 'Ô') then Result := 'Ô';
  if (HCode = '&Otilde;') or (HCode = 'Õ') then Result := 'Õ';
  if (HCode = '&Ouml;') or (HCode = 'Ö') then Result := 'Ö';
  if (HCode = '&times;') or (HCode = '×') then Result := '×';
  if (HCode = '&Oslash;') or (HCode = 'Ø') then Result := 'Ø';
  if (HCode = '&Ugrave;') or (HCode = 'Ù') then Result := 'Ù';
  if (HCode = '&Uacute;') or (HCode = 'Ú') then Result := 'Ú';
  if (HCode = '&Ucirc;') or (HCode = 'Û') then Result := 'Û';
  if (HCode = '&Uuml;') or (HCode = 'Ü') then Result := 'Ü';
  if (HCode = '&Yacute;') or (HCode = 'Ý') then Result := 'Ý';
  if (HCode = '&THORN;') or (HCode = 'Þ') then Result := 'Þ';
  if (HCode = '&szlig;') or (HCode = 'ß') then Result := 'ß';
  if (HCode = '&agrave;') or (HCode = 'à') then Result := 'à';
  if (HCode = '&aacute;') or (HCode = 'á') then Result := 'á';
  if (HCode = '&acirc;') or (HCode = 'â') then Result := 'â';
  if (HCode = '&atilde;') or (HCode = 'ã') then Result := 'ã';
  if (HCode = '&auml;') or (HCode = 'ä') then Result := 'ä';
  if (HCode = '&aring;') or (HCode = 'å') then Result := 'å';
  if (HCode = '&aeling;') or (HCode = 'æ') then Result := 'æ';
  if (HCode = '&ccedil;') or (HCode = 'ç') then Result := 'ç';
  if (HCode = '&egrave;') or (HCode = 'è') then Result := 'è';
  if (HCode = '&eacute;') or (HCode = 'é') then Result := 'é';
  if (HCode = '&ecirc;') or (HCode = 'ê') then Result := 'ê';
  if (HCode = '&euml;') or (HCode = 'ë') then Result := 'ë';
  if (HCode = '&igrave;') or (HCode = 'ì') then Result := 'ì';
  if (HCode = '&iacute;') or (HCode = 'í') then Result := 'í';
  if (HCode = '&icirc;') or (HCode = 'î') then Result := 'î';
  if (HCode = '&iuml;') or (HCode = 'ï') then Result := 'ï';
  if (HCode = '&eth;') or (HCode = 'ð') then Result := 'ð';
  if (HCode = '&ntilde;') or (HCode = 'ñ') then Result := 'ñ';
  if (HCode = '&ograve;') or (HCode = 'ò') then Result := 'ò';
  if (HCode = '&oacute;') or (HCode = 'ó') then Result := 'ó';
  if (HCode = '&ocirc;') or (HCode = 'ô') then Result := 'ô';
  if (HCode = '&otilde;') or (HCode = 'õ') then Result := 'õ';
  if (HCode = '&ouml;') or (HCode = 'ö') then Result := 'ö';
  if (HCode = '&divide;') or (HCode = '÷') then Result := '÷';
  if (HCode = '&oslash;') or (HCode = 'ø') then Result := 'ø';
  if (HCode = '&ugrave;') or (HCode = 'ù') then Result := 'ù';
  if (HCode = '&uacude;') or (HCode = 'ú') then Result := 'ú';
  if (HCode = '&ucirc;') or (HCode = 'û') then Result := 'û';
  if (HCode = '&uuml;') or (HCode = 'ü') then Result := 'ü';
  if (HCode = '&yacute;') or (HCode = 'ý') then Result := 'ý';
  if (HCode = '&thorn;') or (HCode = 'þ') then Result := 'þ';
  if (HCode = '&yuml;') or (HCode = 'ÿ') then Result := 'ÿ';
  if Result = ' then
    begin
      delete(HCode, 1, 2);
      delete(HCode, length(HCode), 1);
      if TryStrToInt(HCode, i) then
        Result := Char(i);
    end;
end;

function ReplaceHTMLChar(sValue: string): string;
var
  tagStartPos : Integer;
  tagEndPos : Integer;
  tag, newTag : string;
  temp : string;
begin
  tagStartPos := Pos('&', sValue);
  tagEndPos := PosEx(';', sValue, tagStartPos);
  if tagEndPos - tagStartPos < 8 then
    begin
      tag := copy(sValue, tagStartPos, tagEndPos - tagStartPos + 1);
      newTag := GiveSZ(tag);
      temp := copy(sValue, 1, tagStartPos - 1) + newTag +
        copy(sValue, tagEndPos + 1, length(sValue) - tagEndPos);
      sValue := temp;
      tagEndPos := tagEndPos - length(tag) + length(newTag);
      while (PosEx('&', sValue, tagEndPos) <> 0) and
        (PosEx(';', sValue, tagEndPos) <> 0) do
        begin
          tagStartPos := PosEx('&', sValue, tagEndPos);
          tagEndPos := PosEx(';', sValue, tagStartPos);
          if tagEndPos - tagStartPos < 8 then
            begin
              tag := copy(sValue, tagStartPos, tagEndPos - tagStartPos + 1);
              newTag := GiveSZ(tag);
              temp := copy(sValue, 1, tagStartPos - 1) + newTag +
                copy(sValue, tagEndPos + 1, length(sValue) - tagEndPos);
              sValue := temp;
              tagEndPos := tagEndPos - length(tag) + length(newTag);
            end;
        end;
    end;
  Result := sValue;
end;

function Html2Txt(html: string): string;
var
  istag : boolean;
  i : Integer;
  ch : Char;
  temp : string;
  slRes : TStrings;
begin
  result := '';
      temp := '';
      istag := false;
      html := ReplaceHTMLChar(html);
      for i := 1 to length(html) do
        begin
          ch := html[i];
          if (ch = '<') and (istag = false) then
            begin
              istag := true;
              continue;
            end;
          if (ch = '>') and (istag = true) then
            begin
              istag := false;
              continue;
            end;
          if istag = false then
            temp := temp + ch;
        end;
      slRes := TStringList.Create;
      try
        slRes.Text := temp;
        for i := 0 to slRes.Count - 1 do
          slRes[i] := Trim(slRes[i]);
        while slRes.IndexOf('') <> -1 do
          slRes.delete(slRes.IndexOf(''));
      finally
        Result := slRes.Text;
        slRes.Free;
      end;
end;
einfach mit html2txt() aufrufen
Damit werden alle html-tags und scripte entfernt, sowie die html-sonderzeichen ersetzt.

Ich arbeite immo noch an einer Lösung mit regulären Ausdrücken.

Gruß
tr909
  Mit Zitat antworten Zitat