I found a very interesting post, “Simple Web Crawler with SAS Macro and SAS Data Step,” at http://www.sascommunity.org/wiki/Simple_Web_Crawler_with_SAS_Macro_and_SAS_Data_Step, with the following code that extracts URLs from the listed websites and queues them up to be crawled in turn.
data work.links_to_crawl;
length url $256;
input url $;
datalines;
http://www.yahoo.com
http://www.sas.com
http://www.google.com
;
run;
%macro crawler();
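/* counter used to number the local copies of fetched pages (file1.html, file2.html, ...) */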
%let html_num = 1;
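/* start with an empty list of crawled links; this step creates one blank row, which never matches a real url in the checks below */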
data work.links_crawled;
length url $256;
run;
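/* top of the crawl loop; the %goto at the bottom of the %do block jumps back here */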
%next_crawl:
/* pop the next url off */
%let next_url = ;
data work.links_to_crawl;
set work.links_to_crawl;
if _n_ eq 1 then call symput("next_url", url);
else output;
run;
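/* CALL SYMPUT copies url with its trailing blanks (it is $256), so strip them */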
%let next_url = %trim(%left(&next_url));
%if "&next_url" ne "" %then %do;
%put crawling &next_url ... ;
/* crawl the url */
filename _nexturl url "&next_url";
/* put the file we crawled here */
filename htmlfile "file%trim(&html_num).html";
/* find more urls */
data work._urls(keep=url);
length url $256 ;
file htmlfile;
infile _nexturl length=len;
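/* len holds the actual record length, so $VARYING2000. reads only the characters present on each line */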
input text $varying2000. len;
put text;
start = 1;
stop = length(text);
if _n_ = 1 then do;
retain patternID;
pattern = '/href="([^"]+)"/i';
patternID = prxparse(pattern);
end;
/* Use PRXNEXT to find the first instance of the pattern, */
/* then use DO WHILE to find all further instances. */
/* PRXNEXT changes the start parameter so that searching */
/* begins again after the last match. */
call prxnext(patternID, start, stop, text, position, length);
do while (position ^= 0);
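/* each match looks like href="url": skip the 6 leading characters of href=" and drop the closing quote (7 characters in all) */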
url = substr(text, position+6, length-7);
* put url=;
output;
call prxnext(patternID, start, stop, text, position, length);
end;
run;
/* add the current link to the list of urls we have already crawled */
data work._old_link;
url = "&next_url";
run;
proc append base=work.links_crawled data=work._old_link;
run;
/* only add urls that we haven’t already crawled or that aren’t queued up to be crawled */
proc sql noprint;
create table work._append as
select url
from work._urls
where url not in (select url from work.links_crawled)
and url not in (select url from work.links_to_crawl);
quit;
/* only add urls that are absolute (http://...) */
data work._append;
set work._append;
absolute_url = substrn(url, 1, 7);
put absolute_url=;
if absolute_url eq "http://";
drop absolute_url;
run;
/* add new links */
proc append base=work.links_to_crawl data=work._append force;
run;
/* increment our file number */
%let html_num = %eval(&html_num + 1);
/* loop */
%goto next_crawl;
%end;
%mend crawler;
%crawler();
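If you want to sanity-check just the link-extraction step before turning the crawler loose, here is a minimal sketch (my own addition, not from the original post) that runs the same PRXPARSE/PRXNEXT pattern and offset arithmetic against a hard-coded line of HTML:

data _null_;
/* same pattern and substr arithmetic as the crawler above */
text = '<a href="http://www.sas.com">SAS</a> and <a href="http://www.google.com">Google</a>';
patternID = prxparse('/href="([^"]+)"/i');
start = 1;
stop = length(text);
call prxnext(patternID, start, stop, text, position, length);
do while (position ^= 0);
url = substr(text, position+6, length-7);
put url=;
call prxnext(patternID, start, stop, text, position, length);
end;
run;

This should write url=http://www.sas.com and url=http://www.google.com to the log, confirming the regex and the +6/-7 offsets behave as expected.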