program scraper;

{
  Finds all the attribute values that match the command-line arguments,
  and writes them to the console.

  If you specify "-" for a filename, it reads input from stdin.
  ( Only works for standard HTML-4 tags and attributes )

  For example:
    scraper A HREF index.html
  ... Will extract all the <a href=""> links in "index.html"
}

{$IFDEF FPC}{$H+}{$MODE OBJFPC}{$ENDIF}
{$IFDEF WIN32}{$APPTYPE CONSOLE}{$ENDIF}

uses tidyobj, {$IFDEF FPC}strings{$ELSE}SysUtils{$ENDIF};


{ Per-attribute callback handed to ForEachAttr.
    name, value    - the attribute being visited; a nil name is ignored.
    req_attr_name  - nil means "print every attribute as name="value"";
                     otherwise it is a null-terminated attribute name and
                     only the value of a case-insensitively matching
                     attribute is printed. }
procedure AttrProc(const info:tNodeInfo; name:pChar; value:pChar; req_attr_name:pointer);
begin
  if ( name = nil ) then EXIT;
  if ( req_attr_name = nil ) then
    WriteLn(name, '="', value, '"')
  else if ( StrIComp(name, pChar(req_attr_name)) = 0 ) then
    WriteLn(value);
end;


{ Per-node callback handed to ForEachTag: runs AttrProc over the node's
  attributes. In wildcard-attribute mode (req_attr_name = nil) a blank line
  is written after each node to separate the groups of attributes. }
procedure NodeProc(const info:tNodeInfo; req_attr_name:pointer);
begin
  ForEachAttr(info, @AttrProc, req_attr_name);
  if ( req_attr_name = nil ) then WriteLn;
end;


{ Prints an optional message followed by the usage text, then terminates
  the program. Never returns to the caller. }
procedure Usage(msg:string);
begin
  WriteLn;
  if ( msg <> '' ) then WriteLn(msg);
  WriteLn;
  WriteLn('Usage:');
  WriteLn(' scraper <element-name> <attribute-name> <input-file>');
  WriteLn;
  WriteLn('Example (lists images):');
  WriteLn(' scraper IMG SRC index.html');
  WriteLn;
  WriteLn( 'Use "@" for wildcard to match any tag or attribute.');
  WriteLn( 'Use "-" for input-file to read from standard input.');
  WriteLn;
  HALT;
end;


var
  Tidy:tTidy;
  tag_id:TidyTagID;
  attr_id:TidyAttrID;
  attr_name:pChar;
  Filename:string;

begin
  if ( ParamCount <> 3 ) then Usage('');

  tag_id:=TagNameToTagID(pChar(ParamStr(1)));
  attr_id:=AttrNameToAttrID(pChar(ParamStr(2)));
  Filename:=ParamStr(3);

  if (tag_id = TidyTag_UNKNOWN) and (ParamStr(1) <> '@') then
    Usage('Unknown tag name: "' + ParamStr(1) + '"');
  if (attr_id = TidyAttr_UNKNOWN) and (ParamStr(2) <> '@') then
    Usage('Unknown attribute name: "' + ParamStr(2) + '"');

  { Reject a genuinely empty argument BEFORE translating "-" into the empty
    string; doing it the other way round made "-" (stdin) always fail with
    "No filename specified". }
  if ( Filename = '' ) then Usage('No filename specified');

  { NOTE(review): an empty filename is presumably how tTidy.ParseFile is told
    to read standard input - confirm against tidyobj. }
  if ( Filename = '-' ) then Filename:='';

  { All Usage() exits happen above, so nothing is allocated when we HALT. }
  attr_name:=nil;
  if ( ParamStr(2) <> '@' ) then attr_name:=StrNew(pChar(ParamStr(2)));
  try
    Tidy:=tTidy.Create(nil);
    try
      Tidy.ForceOutput:=True;
      { Walk the tree only when ParseFile returns a non-empty string. }
      if ( Tidy.ParseFile(Filename) <> '' ) then
        ForEachTag(Tidy.Handle, Tidy.RootNode, tag_id, @NodeProc, attr_name);
    finally
      Tidy.Free;
    end;
  finally
    { Release the copy made by StrNew. }
    if ( attr_name <> nil ) then StrDispose(attr_name);
  end;
end.