SAS/IML Gallerie
Préambule
x 'cd "C:\Users\....\"'; /*to change the work directory*/
%let PROCHTTP_PROXY=http://proxy.univ-lemans.fr:3128; /* to add th proxy if needed */
Use a data from include in a R package in SAS
To be able to use R in SAS, t is often convenient to insert the -RLANG option in a SASV9.CFG (see SAS documentation ) .
Many R packages come with datasets that allow for conducting examples, while others like CASdatasets or insuranceDATA have the main purpose of making a collection of open datasets accessible. An example with R :
install.packages("insuranceData")
#install.packages("CASdatasets", repos = "http://cas.uqam.ca/pub/", type="source")
library(CASdatasets)
data(braggclaim)
data(ausprivauto0405)
library (insuranceData)
data("dataCar")
Use the same data in SAS :
PROC IML;
submit /R;
library(CASdatasets)
data(ausprivauto0405)
library (insuranceData)
data(dataOhlsson)
endsubmit;
call ImportDataSetFromR("work.AusPrivAuto0405","ausprivauto0405");
quit;
Extract all databases of a package R to SAS (output .sas file as DATA; ... DATALINES;...)
My macro accomplishes four steps to improve the previous section :
1. Imports the data "TN" from an R CRAN package to R.
2. Utilizes the PROC CONTENTS procedure to extract a list of variables and their types.
3. Generates a list of macro variables necessary for exporting to a SAS file.
4. Reads the data using DATALINES code and stores it in a SAS file.
*************************************************************;
* récupération sous SAS d'une base de données mis à disposition;
* dans un package R et exportation dans un fichier .sas ;
* de la forme data; ... datalines ;
*************************************************************;
* auteur : Martial Phélippé-Guinvarc'h;
* date : 09/09/2023;
*************************************************************;
%macro LCR(Pkg,TN);
proc iml;
pkg="&Pkg.";
tn="&TN.";
/* Assign the value of the parameter Table Name TN to a variable tn */
%include "Import submit R.sas";
/* Include the external SAS code from the "Import submit R.sas" file */
/* The following code is from the "Import submit R.sas" file */
*submit tn pkg /R;
*library (&pkg)
data(list=c("&tn"))
endsubmit;
/* THis code open the library and open the tn data */
/* to understand why %include statement is needed, see */
/* https://www.lexjansen.com/wuss/2014/76_Final_Paper_PDF.pdf */
call ImportDataSetFromR("&TN.", "&TN." );
/* Call the ImportDataSetFromR subroutine with the TN parameter twice */
quit;
proc contents data=&TN out=a noprint;
/* Use the CONTENTS procedure to retrieve metadata about the dataset specified by TN */
run ;
proc sort data=a ;
by varnum; /* needed to preserve the order of variable list */
proc sql noprint;
select name into: name_list separated by ' "|" ' from a ;
select name into: name separated by ' ' from a ;
/* Retrieve the column names from the dataset and store them in the macro variable name_list */
select count(name) into:n from a ;
/* Count the number of columns in the dataset and store the count in the macro variable n */
select compress(name || case(type) when 1 then "" else "$" end ||
case(FORMAT) when "DATE" then ":DATE9."
when "DATETIME" then ":DATETIME18." else "" end) into: name_listType separated by " " from a ;
select compress( case(type) when 1 then "" else name || compress("$" || put(length,3.)) end ) into: name_Char separated by " " from a ;
select compress( case(FORMAT) when "DATE" then NAME ||":DATE9."
when "DATETIME" then NAME ||":DATETIME18." else "" end) into: name_DATE separated by " " from a ;
/* Create a macro variable name_listType containing column names with a $ suffix for non-numeric columns */
quit;
/* control if needed */
*%put &name_list;
*%put &n;
*%put "0_&TN..sas";
/* create a copie of the table in a sas code file*/;
Filename F "..\..\ActuariatNonVie_M2\SAS\0_&TN..sas";
/* Define a file reference named F with a specific file path */
DATA _null_;
FORMAT &name_date ;
SET &TN end=eof;
/* Start a DATA step with a SET statement to read data from the dataset specified by TN */
FILE F;
/* Specify the file reference F for output */
if _n_=1 then do;
/* Check if the current iteration is the first iteration */
PUT 'data ' "&TN." ' (LABEL="Source CRAN CASdatasets ' "&TN." '");';
PUT "retain " "&name." ";"; /* needed to preserve the order of variable list */
PUT "LENGTH " "&name_char." ";";
put 'infile datalines dsd dlm="|" truncover ;';
/* Write a data step header with a label */
PUT "INPUT &name_listType.;";
PUT "FORMAT " "&name_date." ";";
/* Write an INPUT statement for reading column values */
PUT "DATALINES;" ;
/* Write a DATALINES statement to indicate the start of data lines */
end;
PUT &name_list ;
/* Write the column names to the output file */
IF eof then put ";;;;;;;;;;;;run;";
RUN;
%mend;
/* End of the LCR macro definition */
%LCR(CASdatasets,auscathist);
Please note that the R code `data(package = "PackageName")$results` provides a list of available datasets in a R package.
Then, the following R code allows us to read the SAS file that extracts all databases available in the R package using the previous macro.
write.csv(paste("%LCR(CASdatasets,",as.data.frame(data(package = "CASdatasets")$results)$Item,")"), file=" LCR.sas", row.names = FALSE)
Please note that the command in windows command prompt `type 0_*.sas -> 00_BasesInsuranceFromR.sas` allows us to concatenate all SAS files produced by the macro into a single SAS file named "00_BasesInsuranceFromR.sas".
Exercice de web scraping sur SAS : obtenir le cahier des charges de chaque appellation viticole en pdf
******* Author : Martial Phélippé-Guinvarc'h;
******* Date 22/06/2021;
******** Obtenir le cahier des charges de chaque appellation viticole en pdf ;
**** Il s'agit en premier lieu de lister les appellations, disponibles sur data.gouv.fr***;
filename aoc "datagouv-listesiqopublies.csv";
proc HTTP
url="https://static.data.gouv.fr/resources/siqo-publies/20210608-095222/2021-06-08-datagouv-listesiqopublies.csv"
method="get"
out=aoc;
run;
proc import datafile=aoc
dbms=csv out=aoc(where =(ID_TYPE_PRODUIT between 2 and 3)keep=appellation ID_TYPE_PRODUIT) replace;
delimiter=";";
;
run;
*** le filtre sur id du produit permet de ne prendre que les cahier des charges viticoles;
proc sort data=aoc nodup;
by Appellation;
run;
**** Une recherche sur Qwant, google ou Yahoo permet;
**** de lister les sites qui fournissent les cahiers des charges;
**** On peut utiliser la proc html mais il est plus simple de faire une data infile;
**** Google permet de cibler les pdf directement mais l'import ne contient pas la liste des résultats;
**** Qwant permet de charger la liste des résultats sans bug;
**** Via une data _null_, j'écris un code SAS d'importation pour chaque appellation;
filename websrap 'webscrap.sas'; * in a permanent file;
data _null_;
set work.aoc end=eof;
file websrap;
* Begin step;
g=quote(compress("https://www.qwant.com/?l=fr&t=web&q=cahier+charges+agriculture+telecharger+le+pdf+aoc+"|| quote(Appellation,"'")));
put "filename x url" ;
put g ";";
put "data websrap(where=(line contains 'div domain='";
put "and line contains 'https://info.agriculture.gouv.fr'));";
put "infile x dsd dlm='<>' lrecl=32767 encoding='utf-8';";
put "input line : $800. @@;";
put "run;";
if _n_ = 1 then
put "data Links; set websrap; run;";
else put "data Links; set Links websrap; run;";
/*if eof then put "; ";*/
*run;
run;
**** parmi tous les sites, le plus fiable est probablement https://info.agriculture.gouv.fr ;
**** se limiter à un seul site, s'il est exhaustif (il affiche une antériorité de 2014), évite de ;
**** télécharger plusieurs fois le même cdc sous des noms différents ;
**** ce site est bien structuré, le lien entre du pdf est presque le même que le lien de la page;
**** il suffit d'ajouter /telechargement;
%include 'webscrap.sas';
data Links;
set Links;
Link=quote(compress(scan(line,2,'"')||"/telechargement"));
run;
*** on supprime les doublons pour ne pas télécharger le même cdc;
proc sort data=Links nodupkey;
by link;
run;
**** Via une data _null_, j'écris de nouveau un code SAS d'importation mais cette fois du pdf;
filename websrapi 'webscrapi.sas'; * in a permanent file;
data _null_;
retain a 0;
set work.Links end=eof;
a+1;
name= quote(compress("AOC" || a ||".pdf"));
file websrapi;
* Begin step;
put "filename pdfs " name ";";
put "proc HTTP url=" link;
put 'method="get" ';
put "out= pdfs ;";
put "run;";
run;
%include 'webscrapi.sas';
***** end;
API Hubeau Hydrologie
*************************************************************;
* récupération sous SAS la base de données mise à disposition par Hubeau;
* auteur : Martial Phélippé-Guinvarc'h;
* date : 15/09/2022;
* https://hubeau.eaufrance.fr/;
/*proxy*/
%let PROCHTTP_PROXY=http://localhost:port; * à adapter;
libname Hub "D:\";
*************************************************************;
* import de la liste des stations en json;
filename Hubeau temp ;
proc http
url="https://hubeau.eaufrance.fr/api/v1/hydrometrie/referentiel/stations?format=json&size=9000"
method="GET" out=hubeau;
run;
libname LShub JSON fileref=hubeau;
data ListSite(drop= ordinal_root );
format d_o d_f date9. a best6.;
set LShub.data;
*** convertion des variables caractères ;
array Lib libelle_:;
do over Lib;;
Lib=kcvt(Lib,'utf8','latin1');
end;
*** convertion des dates d ouverture et fermeture;
d_o=input(substr(date_ouverture_station,1,10),YYMMDD10.);
d_f=input(substr(date_fermeture_station,1,10),YYMMDD10.);
if en_service=1 then d_f=today();
a=1;
*** la limite de téléchargement étant 20 000 lignes,
si l'écart entre l'ouverture et la fermeture >20000 alors,
la ligne est dupliquée en période consécutive de 20000 jours;
if d_f-d_o>20000 then do;
output;d_o=d_o+20001;a=2; end;
if d_f-d_o>20000 then do;
output;d_o=d_o+20001;a=3; end;
output;
run;
data ListSite;
set ListSite;
* reconvertion des dates en texte;
date_ouverture_station=put(d_o,YYMMDD10.);
date_fermeture_station=put(min(d_f,d_o+20000),YYMMDD10.);
run;
proc sql;
select count(code_station) into:Nbstation from ListSite;
quit;
************************************************************;
*Initialisation de la data;
Data Hydrometrie;
format grandeur_hydro_elab $3. code_station $12. date_obs_elab $10. resultat_obs_elab Best12.;
set _null_;
run;
*** prefix de l'api;
%let o = %str(https://hubeau.eaufrance.fr/api/v1/hydrometrie/obs_elab?fields=code_station,grandeur_hydro_elab,date_obs_elab,resultat_obs_elab&format=json&size=20000);
%put &o;
%macro importer_tout;
*la macro charge les données par station et par intervalle
max de 20000 j;
%do i=1 %to &Nbstation ;
%put &i.;
filename Hubeau temp;
*** construction du lien entier;
proc sql noprint;
select compress("&o."||"&code_entite=" ||code_station||"&date_debut_obs_elab="||date_fermeture_station)
into:linkstation
from ListSite where ordinal_data=&i;
select code_station into:st
from ListSite where ordinal_data=&i;
quit;
*** téléchargement;
proc http url="&linkstation." method="GET" out=hubeau;
run;
%put &linkstation.;
libname Eaufr JSON fileref=hubeau;
*incrémentation dans la data Hydrometrie;
proc datasets library=EauFr nolist;
append data=data(drop=ordinal_root ordinal_data
where=(code_station="&st.")) base=work.Hydrometrie;
run;
%end;
%mend;
%importer_tout;
proc freq data=work.hydrometrie;
table code_station;
run;
proc sort data=work.hydrometrie out=Hub.hydrometrie;
by code_station date_obs_elab;
run;