unit uHTML2te;

interface

Function DecodeHTML (Const Org: String): String;

implementation

uses Classes, sysutils;

const
  MaxUmlaute  = 89;
  Entities   : array [0..MaxUmlaute] of string[10] =
   ('lt','gt','amp','quot','nbsp','Ccdil','ccdil','Ntilde',
   'ntilde','THORN','thorn','Yacute','yacute','yuml','szlig',
   'AElig','Aacute','Acirc','Agrave','Aring','Atilde','Auml',
   'aelig','aacute','acirc','agrave','aring','atilde','auml',
   'ETH','Eacute','Ecirc','Egrave','Euml','eth','eacute',
   'ecirc','egrave','euml','Iacute','Icirc','Igrave','Iuml',
   'iacute','icirc','igrave','iuml','Oacute','Ocirc',
   'Ograve','Oslash','Otilde','Ouml','oacute','ocirc',
   'ograve','oslash','otilde','ouml','Uacute','Ucirc','Ugrave',
   'Uuml','uacute','ucirc','ugrave','uuml','reg','copy',
   'plusmn','micro','para','middot','cent','pound','yen',
   'frac14','frac12','frac34','sup1','sup2','sup3','iquest',
   'deg','brvbar','sect','laquo','raquo','#132','#147');
  Umlaute : array [0..MaxUmlaute] of char =
   ('<','>','&','"',' ','','','','','','','','','','','','',
   '','','','','','','','','','','','','','','','','',
   '','','','','','','','','','','','','','','','','',
   '','','','','','','','','','','','','','','','','',
   '','','','','','','','','','','','','','','','','',
   '','','','','');

Function DecodeHTML (Const Org: String): String;

   Function GetPar(Const Ges, Such: String): String;
   Var p: Integer;
   begin
      Result := '';
      p := Pos(LowerCase(Such), LowerCase(Ges));
      If p > 0 then begin
         Result := Copy(Ges, p+Length(Such), Length(Ges));
         If Copy(Result, 1, 1) = '"' then begin
            Delete(Result, 1, 1);
            p := Pos('"', Result);
            If p > 0 then Delete(Result, p, Length(Result)-p+1)
         end
      end
   end;

Var p, i, j, k, M, pCRLF, pSpace, AnzCRs: Integer;
    Neu, Befehl, Args, Einschub, Lists: String;
    c, c2, Last: Char;
    b, bBody, bNoSpace, bNoSpaceAfter: Boolean;
begin
   SetLength(Result, Length(Org));
   p := 1; M := 0; pSpace := 0; pCRLF := 0; Last := #0;
   bBody := false; Lists := '';
   bNoSpace := true;
   bNoSpaceAfter := false;
   For i := 1 to Length(Org) do begin
      c := Org[i]; Neu := '';
      If c IN[' ',^I, #13, #10] then c := ' ';
      Case M of
         0: If c = '<' then begin Befehl := ''; Args := ''; Inc(M) end
            else If c = '&' then begin M := 5; Args := '' end
            else If (c <> Last) or (c <> ' ') then Neu := c;
         1: If c = '>' then Inc(M, 2)
            else If c = ' ' then Inc(M)
            else Befehl := Befehl + c;
         2: If c = '>' then Inc(M)
            else Args := Args + c;
         5: If Not (c IN['A'..'Z', 'a'..'z', '#', '0'..'9']) then Inc(M)
            else Args := Args + c;
      end;
      If M = 6 then begin
         For k := 0 to MaxUmlaute do begin
            Neu := '';
            If Entities[k] = Args then begin
               Neu := Umlaute[k]; break
            end;
            If Neu = '' then begin
               If Copy(Args, 1, 1) = '#'
                  then Neu := Chr(StrToInt('0'+Copy(Args, 2, Length(Args))))
                  else Neu := '&'+Args+c
            end
         end;
         M := 0
      end else
      If M = 3 then begin
         M := 0;
         Befehl := LowerCase(Befehl);
         If (Befehl = 'body') or (Befehl = 'html') then begin
            bBody := true; p := 1
         end
         else
         If (Befehl = '/body') or (Befehl = 'head') then bBody := false
         else
         If Befehl = 'a' then begin
            If GetPar(Args, 'href=') > '' then Neu := ' <'+GetPar(Args, 'href=')+'> '
            else If GetPar(Args, 'name=') > '' then Neu := #13#10#13#10'(Abschnitt: #'+GetPar(Args, 'name=')+')'#13#10;
            If Neu > '' then bNoSpaceAfter := true
         end
         else
         If Befehl = '/tr' then Neu := ^I
         else
         If (Befehl = 'ul') then Lists := Lists + #0
         else
         If (Befehl = 'ol') then Lists := Lists + #1
         else
         If (Befehl = 'li') and (Lists > '') then begin
            Einschub := '';
            For k := 1 to Length(Lists)-1 do Einschub := Einschub + '  ';
            If Lists[Length(Lists)] = #0 then
               Einschub := Einschub + '* '
            else begin
               Einschub := Einschub + IntToStr(Ord(Lists[Length(Lists)])) + '. ';
               Inc(Lists[Length(Lists)])
            end;
            bNoSpaceAfter := true;
            Neu := #13#10 + Einschub
         end
         else
         If ((Befehl = '/ul') or (Befehl = '/ol')) and (Lists > '') then begin
            SetLength(Lists, Length(Lists)-1);
            If Lists = '' then Einschub := ''
         end
         else
         If (Befehl = 'b') or (Befehl = '/b') then Neu := '*'
         else
         If (Befehl = 'u') or (Befehl = '/u') then Neu := '_'
         else
         If (Befehl = 'h1') or (Befehl = 'h2') or (Befehl = 'h3') or (Befehl = 'h4') or (Befehl = 'h5')
            then Neu := #13#10#13#10
         else
         If (Befehl = '/h1') or (Befehl = '/h2') or (Befehl = '/h3') or (Befehl = '/h4') or (Befehl = '/h5')
            or (Befehl = '/p') or (Befehl = 'p') or (Befehl = 'br') or (Befehl = '/td') or (Befehl = '/li')
              then Neu := #13#10#13#10
         else
         If Befehl = '/div' then
            Neu := #13#10'--------------------------'#13#10#13#10
      end;
      If bBody then begin
         For j := 1 to Length(Neu) do begin
            c2 := Neu[j];
            If (p - pCRLF > 75) and (pSpace > pCRLF) then begin
               For k := p-1 downto pSpace+1 do Result[k+1+Length(Einschub)] := Result[k];
               Result[pSpace] := #13;
               Result[pSpace+1] := #10;
               For k := 1 to Length(Einschub) do Result[pSpace+1+k] := ' ';
               Inc(p, 1+Length(Einschub));
               pCRLF := pSpace+1;
               bNoSpace := pSpace + 1 = p;
               pSpace := 0
            end;
            b := (c2 <> ' ') or (Not bNoSpace);
            If b and (c2 IN[#13, #10]) and (p>4) then begin
               If (Copy(Result, p-4, 4) = #13#10#13#10) then
                  b := false
               else begin
                  k := p-1;
                  AnzCRs := 0;
                  While (k > 1) and (AnzCRs < 3) do begin
                     Case Result[k] of
                        #13: Inc(AnzCRs);
                        #10, ' ', ^I: ;
                        else break
                     end;
                     Dec(k)
                  end;
                  b := AnzCRs < 3
               end
            end;
            If b then begin
               Case c2 of
                  ' ': pSpace := p;
                  #13, #10: pCRLF := p;
               end;
               If p > Length(Result) then SetLength(Result, p + 1000);
               Result[p] := c2; Inc(p);
               bNoSpace := c = ' '
            end
         end
      end;
      If Neu > '' then If Neu[Length(Neu)]=#10 then bNoSpace := true;
      If bNoSpaceAfter then begin
         bNoSpaceAfter := false;
         bNoSpace := true
      end;
      Last := c;
   end;
   SetLength(Result, p-1)
end;

{Var s: String;
begin
   With TFileStream.Create('test.html', fmOpenRead) do try
      SetLength(s, Size);
      read (s[1], Size)
   finally Free end;
   With TFileStream.Create('Ergebnis.txt', fmCreate) do try
      s := DecodeHTML(s);
      If s = '' then s := 'Kein Inhalt zwischen den Body-Tags bzw. keine Body-Tags...';
      Write (s[1], Length(s))
   finally Free end;
end.}

end.
