program scraper;

{
  Finds all the attribute values that match the command-line 
  arguments, and writes them to the console.

  If you specify "-" for a filename, it reads input from stdin.

  ( Only works for standard HTML-4 tags and attributes )

  For example: 
    scraper A HREF index.html
    ... Will extract all the <a href="">  links in "index.html" 
}


{$IFDEF FPC}{$H+}{$MODE OBJFPC}{$ENDIF}
{$IFDEF WIN32}{$APPTYPE CONSOLE}{$ENDIF}

uses tidyobj, {$IFDEF FPC}strings{$ELSE}SysUtils{$ENDIF};


{
  Attribute callback passed to ForEachAttr by NodeProc.

    info           node information supplied by the iterator (unused here)
    name           attribute name; a nil name is skipped silently
    value          attribute value
    req_attr_name  attribute name to look for, or nil to list everything

  With req_attr_name = nil each attribute is written as name="value".
  Otherwise only the value is written, and only when name equals
  req_attr_name under a case-insensitive comparison.
}
procedure AttrProc(const info:tNodeInfo; name:pChar; value:pChar; req_attr_name:pointer);
begin
  if ( name = nil ) then EXIT;

  if ( req_attr_name = nil ) then begin
    WriteLn(name, '="', value, '"');
    EXIT;
  end;

  if ( StrIComp(name, req_attr_name) = 0 ) then WriteLn(value);
end;


{
  Tag callback passed to ForEachTag by the main program.

    info           node information for the tag being visited
    req_attr_name  attribute name to look for, or nil to list everything

  Runs AttrProc over the attributes of the node. When every attribute is
  being listed (req_attr_name = nil) an empty line is written afterwards.
}
procedure NodeProc(const info:tNodeInfo; req_attr_name:pointer);
var
  list_all:boolean;
begin
  list_all:=( req_attr_name = nil );
  ForEachAttr(info, @AttrProc, req_attr_name);
  if list_all then WriteLn;
end;


{
  Writes an optional message followed by the usage text to standard
  output, then terminates the program.

    msg  text shown above the usage summary; pass '' to show only the
         usage summary.

  This procedure never returns. Every call in this program is made because
  the command line was invalid, so the process exits with status 1 to let
  calling scripts detect the failure.
}
procedure Usage(msg:string);
begin
  WriteLn;
  if ( msg <> '' ) then WriteLn(msg);
  WriteLn;
  WriteLn('Usage:');
  WriteLn('  scraper <element-name> <attribute-name> <input-file>');
  WriteLn;
  WriteLn('Example (lists images):');
  WriteLn('  scraper IMG SRC index.html');
  WriteLn;
  WriteLn( 'Use "@" for wildcard to match any tag or attribute.');
  WriteLn( 'Use "-" for input-file to read from standard input.');
  WriteLn;
  Halt(1);
end;

{
  Main program.

  Expects exactly three arguments: element name, attribute name, input
  file. "@" as element or attribute name acts as a wildcard; "-" as input
  file selects standard input. Any invalid argument ends the program
  through Usage.
}
var
  Tidy:tTidy;
  tag_id:TidyTagID;
  attr_id:TidyAttrID;
  attr_name:pChar;
  Filename:string;
begin
  if ( ParamCount <> 3 ) then Usage('');

  tag_id:=TagNameToTagID(pChar(ParamStr(1)));
  attr_id:=AttrNameToAttrID(pChar(ParamStr(2)));
  Filename:=ParamStr(3);

  if (tag_id = TidyTag_UNKNOWN) and (ParamStr(1) <> '@') then Usage('Unknown tag name: "' + ParamStr(1) + '"');
  if (attr_id = TidyAttr_UNKNOWN) and (ParamStr(2) <> '@')  then Usage('Unknown attribute name: "' + ParamStr(2) + '"');

  { Reject an empty argument BEFORE translating "-" into the empty
    filename; doing it the other way round made "-" always fail with
    "No filename specified". }
  if ( Filename = '' ) then Usage('No filename specified');
  { NOTE(review): assumes tTidy.ParseFile treats an empty filename as
    "read from standard input" - confirm against tidyobj. }
  if ( Filename = '-' ) then Filename:='';

  { nil means "list every attribute"; otherwise keep a private copy of the
    requested name to hand to the callbacks. }
  if (ParamStr(2) <> '@')  then attr_name:=StrNew(pChar(ParamStr(2))) else attr_name:=nil;
  try
    Tidy:=tTidy.Create(nil);
    try
      Tidy.ForceOutput:=True;

      if ( Tidy.ParseFile(Filename) <> '' ) then ForEachTag(Tidy.Handle, Tidy.RootNode, tag_id, @NodeProc, attr_name);
    finally
      Tidy.Free;
    end;
  finally
    { Release the copy made by StrNew above. }
    if ( attr_name <> nil ) then StrDispose(attr_name);
  end;
end.



Get CurlPas and TidyPas at SourceForge.net. Fast, secure and Free Open Source software downloads