program scraper;
{
Finds all the attribute values that match the command-line
arguments, and writes them to the console.
If you specify "-" for a filename, it reads input from stdin.
( Only works for standard HTML-4 tags and attributes )
For example:
scraper A HREF index.html
... Will extract all the <a href=""> links in "index.html"
}
{$IFDEF FPC}{$H+}{$MODE OBJFPC}{$ENDIF}
{$IFDEF WIN32}{$APPTYPE CONSOLE}{$ENDIF}
uses tidyobj, {$IFDEF FPC}strings{$ELSE}SysUtils{$ENDIF};
{ Attribute callback passed to ForEachAttr (see NodeProc).
    info          - node information supplied by the caller; not used here.
    name          - attribute name; attributes without a name are skipped.
    value         - attribute value, written to standard output.
    req_attr_name - nil to print every attribute as name="value"; otherwise a
                    pChar naming the single attribute whose value is printed
                    (compared case-insensitively). }
procedure AttrProc(const info:tNodeInfo; name:pChar; value:pChar; req_attr_name:pointer);
var
  wanted: pChar;
begin
  { Nothing to report for an attribute that has no name. }
  if name = nil then
    Exit;

  wanted := pChar(req_attr_name);
  if wanted = nil then
    WriteLn(name, '="', value, '"')
  else if StrIComp(name, wanted) = 0 then
    WriteLn(value);
end;
{ Tag callback passed to ForEachTag by the main program.
    info          - the node being visited; forwarded to ForEachAttr.
    req_attr_name - forwarded unchanged to AttrProc; nil means "list every
                    attribute", in which case an empty line is written after
                    the node's attributes. }
procedure NodeProc(const info:tNodeInfo; req_attr_name:pointer);
var
  list_all: boolean;
begin
  list_all := not Assigned(req_attr_name);

  ForEachAttr(info, @AttrProc, req_attr_name);

  { When every attribute is listed, separate one node's output from the next. }
  if list_all then
    WriteLn;
end;
{ Prints an optional message followed by the command-line help, then
  terminates the program via Halt. This procedure never returns.
    msg - extra line shown before the help text; skipped when empty. }
procedure Usage(msg:string);
const
  { Fixed help text, one entry per output line (empty entry = blank line). }
  HelpText: array[0..9] of string = (
    '',
    'Usage:',
    ' scraper <element-name> <attribute-name> <input-file>',
    '',
    'Example (lists images):',
    ' scraper IMG SRC index.html',
    '',
    'Use "@" for wildcard to match any tag or attribute.',
    'Use "-" for input-file to read from standard input.',
    ''
  );
var
  i: integer;
begin
  WriteLn;
  if msg <> '' then
    WriteLn(msg);

  for i := Low(HelpText) to High(HelpText) do
    WriteLn(HelpText[i]);

  Halt;
end;
{ Main program.
  Expects exactly three arguments: <element-name> <attribute-name> <input-file>.
  "@" as element or attribute name acts as a wildcard; "-" as input file asks
  for standard input. Any invalid argument prints the help text through Usage,
  which halts the program. }
var
  Tidy:tTidy;
  tag_id:TidyTagID;
  attr_id:TidyAttrID;
  attr_name:pChar;
  Filename:string;
  tag_arg:string;
  attr_arg:string;
begin
  if ( ParamCount <> 3 ) then Usage('');

  { Keep the arguments in named variables so the pChar casts below point at
    strings that stay alive, rather than at temporaries returned by ParamStr. }
  tag_arg:=ParamStr(1);
  attr_arg:=ParamStr(2);
  Filename:=ParamStr(3);

  tag_id:=TagNameToTagID(pChar(tag_arg));
  attr_id:=AttrNameToAttrID(pChar(attr_arg));

  if (tag_id = TidyTag_UNKNOWN) and (tag_arg <> '@') then Usage('Unknown tag name: "' + tag_arg + '"');
  if (attr_id = TidyAttr_UNKNOWN) and (attr_arg <> '@') then Usage('Unknown attribute name: "' + attr_arg + '"');

  { Reject a genuinely empty argument BEFORE translating "-" to the empty
    name; doing it the other way round made "-" always end in Usage. }
  if ( Filename = '' ) then Usage('No filename specified');

  { NOTE(review): the empty name is what this program hands to ParseFile for
    standard input - confirm that tidyobj's ParseFile treats '' that way. }
  if ( Filename = '-' ) then Filename:='';

  { nil means "print every attribute" (see AttrProc). }
  if ( attr_arg <> '@' ) then attr_name:=StrNew(pChar(attr_arg)) else attr_name:=nil;
  try
    Tidy:=tTidy.Create(nil);
    try
      Tidy.ForceOutput:=True;
      if ( Tidy.ParseFile(Filename) <> '' ) then
        ForEachTag(Tidy.Handle, Tidy.RootNode, tag_id, @NodeProc, attr_name);
    finally
      { Release the parser even if parsing or traversal raised. }
      Tidy.Free;
    end;
  finally
    { Release the copy made by StrNew; StrDispose accepts nil. }
    StrDispose(attr_name);
  end;
end.