https://github.com/Azure/Azure-TDSP-Utilities
http://haifengl.github.io/smile/index.html
http://www.eviews.com/home.html
https://en.wikipedia.org/wiki/Bayesian_programming
https://www.rstudio.com/resources/cheatsheets/
https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
http://minimaxir.com/2016/12/interactive-network/
http://mlr-org.github.io/mlr-tutorial/devel/html/ ML in R
http://vitalflux.com/cheat-sheet-10-machine-learning-algorithms-r-commands/
http://shinyapps.org/apps/RGraphCompendium
https://speakerdeck.com/hadley/eight-visualisation-challenges-with-ggplot2
https://habrahabr.ru/post/312632/
https://github.com/thomasp85/ggraph
http://rmarkdown.rstudio.com/flexdashboard/
https://cran.rstudio.com/web/packages/dplyr/vignettes/introduction.html
http://yokekeong.com/bay-area-user-group-meetup-video/
https://github.com/yoke2/dsxref Python and R examples
Generate random numbers from the normal distribution (first R/Python pair) and from the uniform distribution (second pair)
R:
# Fix the RNG seed so the draw below is reproducible.
set.seed(39)
# generates 10 values with mean = 0, standard deviation = 1
rnorm(10, mean = 0, sd = 1)
Python:
import numpy as np

# Fix the RNG seed so the draw below is reproducible.
np.random.seed(seed=39)
# generates 10 values with mean = 0, standard deviation = 1
a = np.random.randn(10)
print(a)
R:
# Fix the RNG seed so the draw below is reproducible.
set.seed(39)
# generates 10 values uniformly distributed between 0 and 1
runif(10, min = 0, max = 1)
Python:
import numpy as np

# Fix the RNG seed so the draw below is reproducible.
np.random.seed(seed=39)
# generates 10 values uniformly distributed over [0.0, 1.0)
a = np.random.uniform(low=0.0, high=1.0, size=10)
print(a)
https://www.rstudio.com/resources/cheatsheets/
http://r-analytics.blogspot.ru/
https://www.analyticsvidhya.com/blog/2016/02/complete-tutorial-learn-data-science-scratch/
http://rmarkdown.rstudio.com/flexdashboard/
https://habrahabr.ru/post/309350/
https://habrahabr.ru/post/309246/
https://habrahabr.ru/post/309420/
https://habrahabr.ru/post/310108/
https://habrahabr.ru/company/infopulse/blog/310288/ clustering
https://habrahabr.ru/post/311168/ clustering
http://blog.revolutionanalytics.com/2016/09/fraud-detection.html
https://www.quora.com/session/Hadley-Wickham/1
http://blog.revolutionanalytics.com/2016/09/pirates-guide-to-r.html
https://github.com/hadley/r4ds
http://rmarkdown.rstudio.com/index.html
https://habrahabr.ru/post/310472/
http://varianceexplained.org/RData/
https://news.ycombinator.com/item?id=12144325
http://www.kdnuggets.com/2015/03/the-grammar-data-science-python-vs-r.html
https://habrahabr.ru/company/infopulse/blog/306184/
https://github.com/swirldev/swirl_courses#swirl-courses
https://www.coursera.org/course/rprog
https://www.coursera.org/course/predmachlearn
https://www.coursera.org/course/exdata
Time Series
http://en.wikipedia.org/wiki/Decomposition_of_time_series
http://a-little-book-of-r-for-time-series.readthedocs.org/en/latest/src/timeseries.html
http://www.gistatgroup.com/gus/
http://habrahabr.ru/post/271265/
http://www.r-bloggers.com/the-guerilla-guide-to-r/
http://blog.safaribooksonline.com/2013/11/14/primer-on-data-visualization-intro-to-r/
Any time series without a constant mean over time is nonstationary.
# Load the comma-separated file; header=TRUE takes the column names from its
# first row.
t <-read.table("c:\\bidgely\\data_with_header.txt", header=TRUE, sep=",")
# Console transcript: the column names of the loaded data frame.
> names(t)
[1] "unixtime" "energy" "weekday" "month" "day" "hhmmss" "year" "hour"
[9] "min"
# Restrict to rows with day == 1 and hour < 4: x holds their hhmmss values,
# y their energy values.
x=subset(t$hhmmss,t$day==1 & t$hour < 4)
y=subset(t$energy,t$day==1 & t$hour < 4)
# Only y is plotted (against its index); x is computed but not used here.
plot(y)
# Load the headerless comma-separated file; with header=FALSE read.table names
# the columns V1, V2, ...
t <-read.table("c:\\bidgely\\data.txt", header=FALSE, sep=",")
names(t)
# Console transcript: smallest and largest V1
# (presumably Unix timestamps in seconds -- confirm).
range(t$V1)
[1] 1312182000 1314860399
# Console transcript: smallest and largest V2.
range(t$V2)
[1] 34 14512
# Scatter plot of V2 against V1.
plot(t$V1,t$V2)
Get every 10th element in the range 1-500
# Thin the first 500 rows down to every 10th one (rows 1, 11, ..., 491).
# seq() is required for a stride: the previous index c(1, 500:10) expanded to
# 1, 500, 499, ..., 10 (a descending run), so nothing was skipped.
every_10th <- seq(1, 500, by = 10)
n_x <- t$V1[every_10th]
# y comes from V2; indexing V1 for both axes only drew the diagonal.
n_y <- t$V2[every_10th]
plot(n_x, n_y)
# Keep only the rows whose V2 is above 1500, then show how those V2 values are
# distributed. which() drops NA comparisons, matching subset().
exclude_refr <- t[which(t$V2 > 1500), ]
hist(exclude_refr$V2)
HISTOGRAM
http://msenux.redwoods.edu/math/R/hist.php
http://www.statmethods.net/graphs/density.html
BAR CHART
http://www.harding.edu/fmccown/r/
http://www.stattler.com/article/some-useful-bar-plots-using-r
http://stackoverflow.com/questions/8148956/how-to-create-grouped-barplot-with-r
# Histograms of V2 split into three bands. The subset() calls stay inline
# because hist() derives its default title and x-axis label from the argument
# expression. Values equal to exactly 1000 or 10000 fall into none of the bands.
# Band 1: V2 below 1000.
hist(subset(t$V2, t$V2 < 1000))
# Band 2: V2 strictly between 1000 and 10000.
hist(subset(t$V2, t$V2 > 1000 & t$V2 < 10000))
# Band 3: V2 above 10000.
hist(subset(t$V2, t$V2 > 10000))
http://www.ling.upenn.edu/~joseff/rstudy/summer2010_basics.html
# Quick summary statistics of the V2 column, each printed at the console.
with(t, min(V2))
with(t, max(V2))
with(t, range(V2))
with(t, mean(V2))
Plotting subset for single day
# One day expressed in seconds.
day=24*60*60
# Plot the rows whose V1 lies within one day of the earliest V1.
plot(subset(t, t$V1 < min(t$V1)+day))
# Latest V1 in the data (console transcript of its value follows).
end=max(t$V1)
> end
[1] 1314860399
# Earliest V1 in the data (console transcript of its value follows).
start=min(t$V1)
> start
[1] 1312182000
# Plots of time windows at the start and end of the recording; `start`, `end`
# and `day` are the values computed just above.
# Last quarter-day. Fixed typo: `t&V1` (logical AND with an undefined V1)
# is now `t$V1`.
plot(subset(t, t$V1 > end - day / 4 & t$V1 < end))
# First quarter-day.
plot(subset(t, t$V1 < start + day / 4))  # this plot shows the trend
# First day, V2 below 1000 only.
plot(subset(t, t$V1 < start + day & t$V2 < 1000))
# First quarter-day, V2 above 5000 only.
plot(subset(t, t$V1 < start + day / 4 & t$V2 > 5000))  # this plot shows the trend
Converting unix timestamp to calendar day
#' Convert Unix epoch seconds to a POSIXct date-time.
#'
#' @param time Numeric vector of seconds since 1970-01-01 00:00:00 UTC.
#' @return `time` tagged as POSIXct. No `tzone` attribute is set, so the value
#'   prints in the session's local time zone.
unix2POSIXct <- function(time) {
  # The class vector must list the most specific class first,
  # c("POSIXct", "POSIXt"), which is what base R itself produces. The reversed
  # order makes S3 dispatch try the generic POSIXt methods before the POSIXct
  # ones.
  structure(time, class = c("POSIXct", "POSIXt"))
}
> unix2POSIXct(1314860375)
[1] "2011-08-31 23:59:35 PDT"
http://stackoverflow.com/questions/8293547/how-to-plot-a-subset-of-a-data-frame-in-r
--------------------------------------------------------------------
# Toy data frame: 61 temperatures from -20 to 40 in steps of 1, paired with 61
# evenly spaced altitudes from 50 to 2500.
dat <- data.frame(
  temperature = seq(from = -20, to = 40, by = 1),
  altitude = seq(from = 50, to = 2500, length.out = 61)
)
# All rows.
plot(dat)
# Formula interface: `subset` limits the plot to non-negative temperatures
# without creating a filtered copy of the data.
plot(altitude ~ temperature, data = dat, subset = temperature >= 0)
# All rows again, for comparison.
plot(dat)
http://www.slideshare.net/jeffreybreen/grouping-summarizing-data-in-r
https://code.google.com/p/sqldf/
http://stackoverflow.com/questions/6370383/whats-the-r-way-to-do-the-following-group-by
------------------------------------------------------
-- Staging table: one column per field of the expanded CSV record.
create table d (
    timestamp int,
    value     int,
    weekday   varchar(4),
    month     varchar(4),
    day       int,
    time      varchar(10),
    year      int,
    hour      int,
    minute    int
);
-- Empty the table before loading so a re-import does not duplicate rows.
truncate table d;
-- Bulk-load the comma-separated, newline-terminated file into d.
IMPORT FROM CSV FILE 'C:\\BIDGELY\\data_py2.txt' INTO d
    WITH RECORD DELIMITED BY '\n'
         FIELD DELIMITED BY ',';
-- Hand-built histogram of `value`: one labelled count per bucket, combined
-- with UNION. Buckets are half-open (lower bound exclusive, upper bound
-- inclusive), so every value falls into exactly one of them. Every SELECT
-- returns a row, so empty buckets still appear with a count of 0.
-- NOTE(review): these queries read from `test`, while the table created and
-- loaded above is `d` -- confirm which table is intended.
select ' <= 300', count(*) from test where value <= 300
union
select '>300 <=700', count(*) from test where value > 300 AND value <= 700
union
select '>700 <=1000', count(*) from test where value > 700 AND value <= 1000
union
-- From here on the buckets are 1000 wide.
select '>1000 <=2000', count(*) from test where value > 1000 AND value <= 2000
union
select '>2000 <=3000', count(*) from test where value > 2000 AND value <= 3000
union
select '>3000 <=4000', count(*) from test where value > 3000 AND value <= 4000
union
select '>4000 <=5000', count(*) from test where value > 4000 AND value <= 5000
union
select '>5000 <=6000', count(*) from test where value > 5000 AND value <= 6000
union
select '>6000 <=7000', count(*) from test where value > 6000 AND value <= 7000
union
select '>7000 <=8000', count(*) from test where value > 7000 AND value <= 8000
union
select '>8000 <=9000', count(*) from test where value > 8000 AND value <= 9000
union
select '>9000 <=10000', count(*) from test where value > 9000 AND value <= 10000
union
select '>10000 <=11000', count(*) from test where value > 10000 AND value <= 11000
union
select '>11000 <=12000', count(*) from test where value > 11000 AND value <= 12000
union
-- Open-ended top bucket.
select '>12000 ', count(*) from test where value > 12000
----------------------------------------------------------------------
-- Total value per hour of day.
select hour, sum(value)
from d
group by hour
order by hour;
-- Row count and total value per day of month.
-- NOTE(review): this query and the next read from `data`, not `d` -- confirm
-- which table is intended.
select day, count(*), sum(value)
from data
group by day
order by day;
-- Row count and total value per weekday.
select weekday, count(*), sum(value)
from data
group by weekday
order by weekday;
------------------------------------------------------------------------
import time


def format_record(line):
    """Expand one ``unixtime,value`` input line into a CSV record.

    Returns ``unixtime,value,<ctime fields>,hour,min`` where the ctime fields
    are the whitespace-separated parts of ``time.ctime()`` (local time).
    """
    (unixtime, value) = line.rstrip().split(',')
    calendar = time.ctime(float(unixtime))
    # ctime() pads single-digit days with an extra space ("Aug  1"), so a plain
    # replace(' ', ',') emitted an empty field and shifted every later column.
    # split() collapses runs of whitespace.
    cal = ','.join(calendar.split())
    hour = calendar[11:13]
    # Only the first digit of the minute is kept, exactly as before.
    # NOTE(review): use calendar[14:16] if the full minute is wanted - confirm.
    minute = calendar[14]
    return "%s,%s,%s,%s,%s" % (unixtime, value, cal, hour, minute)


def main(file="data.txt"):
    """Print one expanded record per line of ``file``."""
    #file="data_small.txt"
    # The with-block closes the input even if a malformed line raises.
    with open(file, 'r') as fin:
        for line in fin:
            print(format_record(line))


if __name__ == "__main__":
    main()
R
http://www.statmethods.net/advgraphs/layout.html several plots layout par() mfrow()
http://www.stat.auckland.ac.nz/~paul/Talks/Rgraphics.pdf
https://sites.google.com/site/r4statistics/
http://learnr.wordpress.com/ R language
http://had.co.nz/ggplot2/book/
Script to produce barplot
File: 0.txt
-6 3318085 22316093102
-5 380604717 1808012186266
-4 133078 1827654758
-3 14970348 72388068541
-1 510845988 990690275168
0 9793840 54337511506
1 10779447 60064465195
2 207670 811929902
3 49057 165876044
6 1379432 7225461223
7 5739374 23773842841
8 25149155 99311740610
9 4695941 44477979918
10 4035427 13287655789
12 984193 3325133371
# Grouped bar chart: for each timeshift (V1), the share of records (V2) and
# the share of total size (V3), each as a percentage of its column total.
zero <- read.table("0.txt")
mycolors <- c("darkblue", "red")
# sum() on an integer column returns NA (with a warning) once the total passes
# .Machine$integer.max, so the totals are accumulated in double precision.
pct_of_total <- function(v) 100 * v / sum(as.numeric(v))
barplot(
  rbind(pct_of_total(zero$V2), pct_of_total(zero$V3)),
  names.arg = zero$V1,
  xlab = "Timeshift in hours ( -1 corresponds to src_pty=NULL) ",
  ylab = "% from total",
  main = "% of total records (blue) and % of total size(red) per timeshift 2011/02/27 00:00",
  beside = TRUE,
  col = mycolors
)
legend("topright", c("%Records", "%Size"), cex = 0.6, bty = "n", fill = mycolors)
----------------------------------
# Compare the per-timeshift size distribution (V3) of the full data set with
# six sampled versions of it, as grouped bars (one colour per data set).
mycolors <- rainbow(7)
all <- read.table("0.txt")
step1000 <- read.table("00-step1000.txt")
rand1000 <- read.table("00-rand1000.txt")
step10000 <- read.table("00-step10000.txt")
rand10000 <- read.table("00-rand10000.txt")
stepmln <- read.table("00-step-mln.txt")
randmln <- read.table("00-rand-mln.txt")
# One row per data set, in legend order. Every total is accumulated in double
# precision so an integer V3 column cannot overflow; previously the `all` row
# alone lacked this guard.
size_pct <- do.call(
  rbind,
  lapply(
    list(all, step1000, rand1000, step10000, rand10000, stepmln, randmln),
    function(d) 100 * d$V3 / sum(as.numeric(d$V3))
  )
)
# NOTE(review): bar labels come from step1000$V1, which assumes every file
# lists the same timeshifts in the same order -- confirm.
barplot(
  size_pct,
  names.arg = step1000$V1,
  xlab = "Timeshift in hours ",
  ylab = "% from total ABF1 feed size",
  main = "ABF1 1 hour feed: % of total size per timeshift 2011/02/27 00:00",
  beside = TRUE,
  col = mycolors
)
legend(
  "topright",
  c(
    "Counting size of all records",
    "Sampling step=1000",
    "Sampling randomly 1000",
    "Sampling step=10,000",
    "Sampling randomly 10,000",
    "Sampling step=1,000,000",
    "Sampling randomly 1,000,000"
  ),
  cex = 0.6,
  bty = "n",
  fill = mycolors
)
--------------------------------
# Bar chart of V3 per timeshift, scaled down by 1e9 and labelled as GB.
with(
  zero,
  barplot(
    V3 / 1e9,
    names.arg = V1,
    xlab = "Timeshift in hours ( -1 corresponds to src_pty=NULL) ",
    ylab = "GB",
    main = "GB per timeshift for /data/FETL/ABF1_SEQ/201102270000/"
  )
)
----------------------------------------
# Percentage of records (V2) per filterTag (V1); `n` is a data frame loaded
# elsewhere. Fixed the misspelt "distibution" in the plot title.
mycolors <- rainbow(5)
barplot(
  100 * n$V2 / sum(n$V2),
  names.arg = n$V1,
  col = mycolors,
  xlab = "filterTag ",
  ylab = "% of records with src_pty = NULL",
  main = " ABF1 records distribution for src_pty = NULL"
)
-----------------------------------