I found a very interesting post, “Simple Web Crawler with SAS Macro and SAS Data Step,” at http://www.sascommunity.org/wiki/Simple_Web_Crawler_with_SAS_Macro_and_SAS_Data_Step, with the following code that extracts URLs from the listed websites and queues them up to be crawled in turn.
data work.links_to_crawl;
length url $256;
input url $;
datalines;
http://www.yahoo.com
http://www.sas.com
http://www.google.com
;
run;
%macro crawler();
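/* counter used to number the local copies of fetched pages (file1.html, file2.html, ...) */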
%let html_num = 1;
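/* start with an empty list of crawled links; this step creates one blank row, which never matches a real url in the checks below */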
data work.links_crawled;
length url $256;
run;
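/* top of the crawl loop; the %goto at the bottom of the %do block jumps back here */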
%next_crawl:
/* pop the next url off */
%let next_url = ;
data work.links_to_crawl;
set work.links_to_crawl;
if _n_ eq 1 then call symput("next_url", url);
else output;
run;
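/* CALL SYMPUT copies url with its trailing blanks (it is $256), so strip them */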
%let next_url = %trim(%left(&next_url));
%if "&next_url" ne "" %then %do;
%put crawling &next_url ... ;
/* crawl the url */
filename _nexturl url "&next_url";
/* put the file we crawled here */
filename htmlfile "file%trim(&html_num).html";
/* find more urls */
data work._urls(keep=url);
length url $256 ;
file htmlfile;
infile _nexturl length=len;
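/* len holds the actual record length, so $VARYING2000. reads only the characters present on each line */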
input text $varying2000. len;
put text;
start = 1;
stop = length(text);
if _n_ = 1 then do;
retain patternID;
pattern = '/href="([^"]+)"/i';
patternID = prxparse(pattern);
end;
/* Use PRXNEXT to find the first instance of the pattern, */
/* then use DO WHILE to find all further instances. */
/* PRXNEXT changes the start parameter so that searching */
/* begins again after the last match. */
call prxnext(patternID, start, stop, text, position, length);
do while (position ^= 0);
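/* each match looks like href="url": skip the 6 leading characters of href=" and drop the closing quote (7 characters in all) */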
url = substr(text, position+6, length-7);
* put url=;
output;
call prxnext(patternID, start, stop, text, position, length);
end;
run;
/* add the current link to the list of urls we have already crawled */
data work._old_link;
url = "&next_url";
run;
proc append base=work.links_crawled data=work._old_link;
run;
/* only add urls that we haven’t already crawled or that aren’t queued up to be crawled */
proc sql noprint;
create table work._append as
select url
from work._urls
where url not in (select url from work.links_crawled)
and url not in (select url from work.links_to_crawl);
quit;
/* only add urls that are absolute (http://...) */
data work._append;
set work._append;
absolute_url = substrn(url, 1, 7);
put absolute_url=;
if absolute_url eq "http://";
drop absolute_url;
run;
/* add new links */
proc append base=work.links_to_crawl data=work._append force;
run;
/* increment our file number */
%let html_num = %eval(&html_num + 1);
/* loop */
%goto next_crawl;
%end;
%mend crawler;
%crawler();
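If you want to sanity-check just the link-extraction step before turning the crawler loose, here is a minimal sketch (my own addition, not from the original post) that runs the same PRXPARSE/PRXNEXT pattern and offset arithmetic against a hard-coded line of HTML:

data _null_;
/* same pattern and substr arithmetic as the crawler above */
text = '<a href="http://www.sas.com">SAS</a> and <a href="http://www.google.com">Google</a>';
patternID = prxparse('/href="([^"]+)"/i');
start = 1;
stop = length(text);
call prxnext(patternID, start, stop, text, position, length);
do while (position ^= 0);
url = substr(text, position+6, length-7);
put url=;
call prxnext(patternID, start, stop, text, position, length);
end;
run;

This should write url=http://www.sas.com and url=http://www.google.com to the log, confirming the regex and the +6/-7 offsets behave as expected.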