SAS/IML Gallery

Preamble

x 'cd "C:\Users\....\"'; /* to change the working directory */

%let PROCHTTP_PROXY=http://proxy.univ-lemans.fr:3128; /* to add the proxy if needed */

Use a dataset included in an R package from SAS

To be able to call R from SAS, the -RLANG option must be enabled in SASV9.CFG (see the SAS documentation).
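
For reference, a minimal sketch of that configuration (RLANG can only be set at SAS start-up, typically in the configuration file; the check below simply reports whether it is active):

/* in SASV9.CFG (or on the SAS command line), add:  -RLANG        */
/* then, inside a SAS session, verify the setting:                */
proc options option=rlang;
run;
/* the log shows RLANG or NORLANG; R calls in IML require RLANG.  */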

Many R packages ship datasets that can be used for examples, while others, such as CASdatasets or insuranceData, exist mainly to make a collection of open datasets accessible. An example in R:

install.packages("insuranceData")
# install.packages("CASdatasets", repos = "http://cas.uqam.ca/pub/", type = "source")
library(CASdatasets)
data(braggclaim)
data(ausprivauto0405)

library(insuranceData)
data("dataCar")

Use the same data in SAS:

PROC IML;
submit /R;
library(CASdatasets)
data(ausprivauto0405)

library(insuranceData)
data(dataOhlsson)
endsubmit;

call ImportDataSetFromR("work.AusPrivAuto0405","ausprivauto0405");
quit;
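
A quick, optional check that the import worked (nothing is assumed here beyond the dataset name used above):

proc contents data=work.AusPrivAuto0405 short;
run;

proc print data=work.AusPrivAuto0405(obs=5);
run;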


Extract all the datasets of an R package to SAS (output: a .sas file of the form DATA ...; DATALINES; ...)

The macro below improves on the previous section in four steps:

1. Imports the dataset TN from a CRAN package into R.

2. Uses PROC CONTENTS to extract the list of variables and their types.

3. Generates a list of macro variables necessary for exporting to a SAS file.

4. Writes the data as a DATA step with DATALINES and stores it in a .sas file.




*************************************************************;
* retrieve in SAS a dataset made available in an R package   ;
* and export it to a .sas file of the form                   ;
*   "data ... datalines"                                      ;
*************************************************************;
* author : Martial Phélippé-Guinvarc'h;
* date : 09/09/2023;
*************************************************************;

%macro LCR(Pkg,TN);

proc iml;

pkg="&Pkg.";

tn="&TN.";

/* Assign the value of the parameter Table Name TN to a variable tn */

%include "Import submit R.sas";

/* Include the external SAS code from the "Import submit R.sas" file */

/* Contents of the "Import submit R.sas" file (reproduced here as a comment):

   submit tn pkg / R;
     library(&pkg)
     data(list = c("&tn"))
   endsubmit;

   This R block loads the package and attaches the dataset tn. */

/* to understand why the %include statement is needed, see */

/* https://www.lexjansen.com/wuss/2014/76_Final_Paper_PDF.pdf */

 

call ImportDataSetFromR("&TN.", "&TN." );

/* Import the R data frame TN into a SAS dataset with the same name */

quit;

 

proc contents data=&TN out=a noprint;

/* Use the CONTENTS procedure to retrieve metadata about the dataset specified by TN */

run ;

proc sort data=a ;

by varnum; /* needed to preserve the order of variable list */

 

proc sql noprint;

select name into: name_list separated by ' "|" ' from a ;

select name into: name separated by ' ' from a ;

/* Store the column names in the macro variables name_list (pipe-delimited, for the PUT statement) and name (space-delimited) */

select count(name) into:n  from a ;

/* Count the number of columns in the dataset and store the count in the macro variable n */

select compress(name || case(type) when 1 then "" else "$"  end || 

case(FORMAT) when "DATE" then ":DATE9." 

    when "DATETIME" then ":DATETIME18." else "" end) into: name_listType separated by " " from a ;

select compress(  case(type) when 1 then "" else name || compress("$" || put(length,3.)) end ) into: name_Char separated by " " from a ;

select case(FORMAT) when "DATE" then catx(" ", NAME, "DATE9.")

  when "DATETIME" then catx(" ", NAME, "DATETIME18.") else "" end into: name_DATE separated by " " from a ;

/* name_listType adds a $ for character columns and a date informat where needed; name_Char builds the LENGTH entries; name_DATE builds the FORMAT entries */

quit;

 

/* control if needed */

*%put &name_list;

*%put &n;

*%put "0_&TN..sas";

 

/* create a copy of the table in a SAS code file */

Filename F "..\..\ActuariatNonVie_M2\SAS\0_&TN..sas";

/* Define a file reference named F with a specific file path */

DATA _null_;

FORMAT &name_date ;

SET &TN end=eof;

 

/* Start a DATA step with a SET statement to read data from the dataset specified by TN */

FILE F;

/* Specify the file reference F for output */

 

if _n_=1 then do;

/* Check if the current iteration is the first iteration */

PUT 'data ' "&TN." ' (LABEL="Source CRAN ' "&Pkg. &TN." '");';

PUT "retain " "&name." ";";  /* needed to preserve the order of variable list */

PUT "LENGTH " "&name_char." ";";

put 'infile datalines dsd dlm="|" truncover ;';

/* Write a data step header with a label */

PUT "INPUT &name_listType.;"

PUT "FORMAT " "&name_date." ";";

/* Write an INPUT statement for reading column values */

PUT "DATALINES;" ;

/* Write a DATALINES statement to indicate the start of data lines */

end;

PUT  &name_list ;

/* Write the column names to the output file */

IF eof then put ";;;;;;;;;;;;run;";

RUN;  

%mend;

/* End of the LCR macro definition */

 

%LCR(CASdatasets,auscathist);
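
For illustration, the generated .sas file has roughly the following shape. The sketch below is runnable but uses a hypothetical dataset (mydata, with made-up variables and values), not actual CASdatasets content:

data mydata (LABEL="Source CRAN CASdatasets mydata");
retain id label dt x;
LENGTH label $8;
infile datalines dsd dlm="|" truncover ;
INPUT id label$ dt:DATE9. x;
FORMAT dt DATE9. ;
DATALINES;
1|A|01JAN2020|0.5
2|B|15MAR2021|1.2
;;;;;;;;;;;;run;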


Please note that the R code `data(package = "PackageName")$results` returns the list of datasets available in an R package.

Then, the following R code writes a SAS program, LCR.sas, that calls the previous macro once for each dataset available in the R package:

writeLines(paste0("%LCR(CASdatasets,", as.data.frame(data(package = "CASdatasets")$results)$Item, ");"), con = "LCR.sas")

Please note that, in the Windows command prompt, the command `type 0_*.sas > 00_BasesInsuranceFromR.sas` concatenates all the SAS files produced by the macro into a single SAS file named "00_BasesInsuranceFromR.sas".
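
Putting the pieces together, a minimal sketch of the whole round trip (assuming LCR.sas and the generated 0_*.sas files sit in the current working directory; adapt the paths used in the macro and in the R call above otherwise):

%include "LCR.sas";                              /* runs one %LCR call per dataset          */
x 'type 0_*.sas > 00_BasesInsuranceFromR.sas';   /* concatenate the generated programs      */
%include "00_BasesInsuranceFromR.sas";           /* re-create every dataset from DATALINES  */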

A web-scraping exercise in SAS: download the product specification (cahier des charges) of each wine appellation as a PDF

******* Author : Martial Phélippé-Guinvarc'h;

******* Date 22/06/2021;

******** Download the product specification (cahier des charges) of each wine appellation as a PDF ;

**** The first step is to list the appellations, which are available on data.gouv.fr ***;


filename aoc "datagouv-listesiqopublies.csv";

 

proc HTTP   

url="https://static.data.gouv.fr/resources/siqo-publies/20210608-095222/2021-06-08-datagouv-listesiqopublies.csv"

method="get" 

out=aoc;   

run;

proc import datafile=aoc

dbms=csv out=aoc(where=(ID_TYPE_PRODUIT between 2 and 3) keep=appellation ID_TYPE_PRODUIT) replace;

delimiter=";";

run;

*** the filter on the product type ID keeps only the wine-related specifications;

 

proc sort data=aoc nodup;

by Appellation;

run;

 

**** A search on Qwant, Google or Yahoo makes it possible;
**** to list the sites that provide the specifications;
**** PROC HTTP could be used, but a DATA step with an INFILE URL is simpler;
**** Google can target the PDFs directly, but the import does not contain the list of results;
**** Qwant loads the list of results without problems;

 

 

**** Via a DATA _NULL_ step, I write a SAS import program for each appellation;

 

filename websrap 'webscrap.sas'; * in a permanent file;

data _null_;

set work.aoc end=eof;

file websrap;

* Begin step;

g=quote(compress("https://www.qwant.com/?l=fr&t=web&q=cahier+charges+agriculture+telecharger+le+pdf+aoc+"|| quote(Appellation,"'")));

put "filename x url" ;

put  g ";";

put "data websrap(where=(line contains  'div domain='";

put "and line contains 'https://info.agriculture.gouv.fr'));";

put "infile x dsd dlm='<>' lrecl=32767 encoding='utf-8';";

put "input  line : $800. @@;";

put "run;";

 

if _n_ = 1 then 

put "data Links; set  websrap; run;";

else put "data Links; set Links websrap; run;";


run;

 

**** among all the sites, the most reliable is probably https://info.agriculture.gouv.fr ;
**** restricting the search to a single site, if it is exhaustive (its archive goes back to 2014), avoids ;
**** downloading the same specification several times under different names ;
**** the site is well structured: the link to the PDF is almost the same as the link to the page;
**** it is enough to append /telechargement;

 

 

%include 'webscrap.sas';

 

data Links;

set Links;

Link=quote(compress(scan(line,2,'"')||"/telechargement"));

run;

 

*** remove duplicates so the same specification is not downloaded twice;

proc sort data=Links nodupkey;

by link;

run;

 

 

**** Via a DATA _NULL_ step, I again write a SAS import program, this time to download each PDF;

 

filename websrapi 'webscrapi.sas'; * in a permanent file;

data _null_;

retain a 0;

set work.Links end=eof;

a+1;

name = quote(cats("AOC", a, ".pdf")); /* builds the output file name, e.g. AOC1.pdf */

file websrapi;

* Begin step;

put "filename pdfs " name ";";

put "proc HTTP url=" link;

put 'method="get" ';

put "out= pdfs ;";   

put "run;";

run;

 

%include 'webscrapi.sas';

 

***** end; 


Hubeau Hydrology API



*************************************************************;

* retrieve in SAS the database made available by Hubeau;

* author : Martial Phélippé-Guinvarc'h;

* date : 15/09/2022;

* https://hubeau.eaufrance.fr/;

/* proxy */

%let PROCHTTP_PROXY=http://localhost:port; * adapt as needed;

libname Hub "D:\";

 *************************************************************;

* import the list of stations as JSON;

filename Hubeau temp ;

proc http

url="https://hubeau.eaufrance.fr/api/v1/hydrometrie/referentiel/stations?format=json&size=9000"

method="GET" out=hubeau;

run;

libname LShub JSON fileref=hubeau;
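
To see which tables the JSON engine exposes (typically ALLDATA plus one table per JSON object, such as DATA used below), an optional quick check:

proc contents data=LShub._all_ nods;
run;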

 

 

data ListSite(drop= ordinal_root );

format d_o d_f date9. a best6.;

set LShub.data;

*** convert the character variables from UTF-8 to Latin-1;

array Lib libelle_:; 

do over Lib;

Lib=kcvt(Lib,'utf8','latin1');

end;

*** convert the station opening and closing dates;

d_o=input(substr(date_ouverture_station,1,10),YYMMDD10.);

d_f=input(substr(date_fermeture_station,1,10),YYMMDD10.);

if en_service=1 then d_f=today();

a=1;

 

*** since the download limit is 20 000 rows,

if the gap between the opening and closing dates exceeds 20000 days,

the row is duplicated into consecutive 20000-day periods;

 

if d_f-d_o>20000 then do;

output; d_o=d_o+20001; a=2; end;

if d_f-d_o>20000 then do;

output; d_o=d_o+20001; a=3; end;

output;

 

run;

 

 

data ListSite;

set ListSite;

* convert the dates back to text;

date_ouverture_station=put(d_o,YYMMDD10.);

date_fermeture_station=put(min(d_f,d_o+20000),YYMMDD10.);

 

run;

 

proc sql;

select count(code_station) into:Nbstation from ListSite;

quit;

 

 

************************************************************;

* initialize the Hydrometrie dataset;

 

Data Hydrometrie;

format grandeur_hydro_elab $3. code_station $12. date_obs_elab $10. resultat_obs_elab Best12.;

stop; /* create an empty dataset containing only the declared variables */

run;

 

*** prefix of the API URL;

%let o = %nrstr(https://hubeau.eaufrance.fr/api/v1/hydrometrie/obs_elab?fields=code_station,grandeur_hydro_elab,date_obs_elab,resultat_obs_elab&format=json&size=20000); /* %nrstr keeps the ampersands literal */

%put &o;

 

%macro importer_tout;

* the macro loads the data station by station,

over intervals of at most 20000 days;

 

%do i=1 %to &Nbstation ;

 

%put &i.;

 

filename Hubeau temp;

 

*** build the full request URL;

proc sql noprint;

select compress("&o."||"&code_entite=" ||code_station||"&date_debut_obs_elab="||date_fermeture_station)

             into:linkstation

  from ListSite where ordinal_data=&i;

select code_station into:st

  from ListSite where ordinal_data=&i;

quit;

 

*** download;

proc http url="&linkstation." method="GET" out=hubeau;

run;

 

%put &linkstation.;

libname Eaufr JSON fileref=hubeau;

 

* append the result to the Hydrometrie dataset;

proc datasets library=EauFr nolist;

append data=data(drop=ordinal_root ordinal_data

where=(code_station="&st.")) base=work.Hydrometrie;

run;

%end;

%mend;

 

%importer_tout;

 

proc freq data=work.hydrometrie;

table code_station;

run;

 

proc sort data=work.hydrometrie out=Hub.hydrometrie;

by code_station date_obs_elab;

run;