R

How do I generate random numbers from the normal distribution?

# Seed the random number generator so the draw below is reproducible.
set.seed(39)
# Generates 10 values with mean = 0, standard deviation = 1.
rnorm(10, mean = 0, sd = 1)

Python:

import numpy as np

# Seed NumPy's global random number generator so the draw below is reproducible.
np.random.seed(seed=39)
# Generates 10 values with mean = 0, standard deviation = 1.
a = np.random.randn(10)
# print() with parentheses works on both Python 2 and Python 3.
print(a)

How do I generate random numbers from the uniform distribution?

# Seed the random number generator so the draw below is reproducible.
set.seed(39)
# Generates 10 values drawn uniformly between 0 and 1.
runif(10, min = 0, max = 1)

Python:

import numpy as np

# Seed NumPy's global random number generator so the draw below is reproducible.
np.random.seed(seed=39)
# Generates 10 values drawn uniformly between 0.0 and 1.0.
a = np.random.uniform(low=0.0, high=1.0, size=10)
# print() with parentheses works on both Python 2 and Python 3.
print(a)

https://www.rstudio.com/resources/cheatsheets/

http://r-analytics.blogspot.ru/

https://www.analyticsvidhya.com/blog/2016/02/complete-tutorial-learn-data-science-scratch/

http://rmarkdown.rstudio.com/flexdashboard/

https://habrahabr.ru/post/309350/

https://habrahabr.ru/post/309246/

https://habrahabr.ru/post/309420/

https://habrahabr.ru/post/310108/

https://habrahabr.ru/company/infopulse/blog/310288/ clustering

https://habrahabr.ru/post/311168/ clustering

http://blog.revolutionanalytics.com/2016/09/fraud-detection.html

https://www.quora.com/session/Hadley-Wickham/1

http://blog.revolutionanalytics.com/2016/09/pirates-guide-to-r.html

http://r4ds.had.co.nz/

https://github.com/hadley/r4ds

http://adv-r.had.co.nz/

http://rmarkdown.rstudio.com/index.html

https://habrahabr.ru/post/310472/

http://varianceexplained.org/RData/

https://news.ycombinator.com/item?id=12144325

http://www.kdnuggets.com/2015/03/the-grammar-data-science-python-vs-r.html

https://habrahabr.ru/company/infopulse/blog/306184/

https://github.com/swirldev/swirl_courses#swirl-courses

http://shiny.rstudio.com/

https://www.coursera.org/course/rprog

https://www.coursera.org/course/predmachlearn

https://www.coursera.org/course/exdata

Time Series

http://en.wikipedia.org/wiki/Decomposition_of_time_series

http://a-little-book-of-r-for-time-series.readthedocs.org/en/latest/src/timeseries.html

http://www.gistatgroup.com/gus/

http://habrahabr.ru/post/271265/

http://www.r-bloggers.com/the-guerilla-guide-to-r/

http://blog.safaribooksonline.com/2013/11/14/primer-on-data-visualization-intro-to-r/

Any time series without a constant mean over time is nonstationary.

# Load the comma-separated readings; the first line of the file supplies
# the column names.
t <- read.table(
  file = "c:\\bidgely\\data_with_header.txt",
  sep = ",",
  header = TRUE
)

names(t)
#> [1] "unixtime" "energy" "weekday" "month" "day" "hhmmss" "year" "hour"
#> [9] "min"

# Rows where day == 1 and hour < 4.
early_day1 <- t$day == 1 & t$hour < 4
x <- subset(t$hhmmss, early_day1)
y <- subset(t$energy, early_day1)

# Energy values of the selected rows, in file order.
plot(y)

# Re-read the raw file. It has no header row, so read.table() assigns the
# default column names V1, V2, ...
t <- read.table(
  file = "c:\\bidgely\\data.txt",
  sep = ",",
  header = FALSE
)
names(t)

# Smallest and largest value of each column.
range(t$V1)
#> [1] 1312182000 1314860399
range(t$V2)
#> [1] 34 14512

# Second column against the first, all rows.
plot(x = t$V1, y = t$V2)

Get every 10th element in a range

# Indices 1, 11, 21, ..., 491: every 10th row among the first 500.
# (The previous `c(1, 500:10)` produced row 1 followed by rows 500 down to 10,
# not a stride of 10.)
every_10th <- seq(1, 500, by = 10)

# x comes from V1 and y from V2; previously both were taken from V1, which
# plotted V1 against itself.
n_x <- t$V1[every_10th]
n_y <- t$V2[every_10th]

plot(n_x, n_y)

# Keep only the rows whose V2 value is above 1500, then look at how those
# values are distributed.
exclude_refr <- subset(t, V2 > 1500)
hist(exclude_refr$V2)

HISTOGRAM

http://msenux.redwoods.edu/math/R/hist.php

http://www.statmethods.net/graphs/density.html

BAR CHART

http://www.harding.edu/fmccown/r/

http://www.stattler.com/article/some-useful-bar-plots-using-r

http://stackoverflow.com/questions/8148956/how-to-create-grouped-barplot-with-r

# Histograms of V2 split into three value ranges.
# The call expressions are intentionally left as they are: hist() builds its
# default title and x-axis label from the argument expression.
# NOTE(review): all bounds are strict, so values exactly equal to 1000 or
# 10000 appear in none of the three histograms - confirm this is intended.

# V2 below 1000.
hist(subset(t$V2, t$V2 < 1000))

# V2 strictly between 1000 and 10000.
hist(subset(t$V2, t$V2 > 1000 & t$V2 < 10000))

# V2 above 10000.
hist(subset(t$V2, t$V2 > 10000))

http://www.ling.upenn.edu/~joseff/rstudy/summer2010_basics.html

# Print the minimum, maximum, range and mean of the V2 column, in that order.
for (stat in list(min, max, range, mean)) {
  print(stat(t$V2))
}

Plotting subset for single day

# Number of seconds in one day; V1 is compared against it as a timestamp in
# seconds.
day <- 24 * 60 * 60

# First day of readings.
plot(subset(t, t$V1 < min(t$V1) + day))

end <- max(t$V1)
end
#> [1] 1314860399

start <- min(t$V1)
start
#> [1] 1312182000

# Last quarter-day of readings (fixed: `t&V1` was a typo for `t$V1`).
plot(subset(t, t$V1 > end - day / 4 & t$V1 < end))

# First quarter-day of readings -- this plot shows the trend.
plot(subset(t, t$V1 < start + day / 4))

# First day, restricted to readings below 1000.
plot(subset(t, t$V1 < start + day & t$V2 < 1000))

# First quarter-day, restricted to readings above 5000 -- this plot shows
# the trend.
plot(subset(t, t$V1 < start + day / 4 & t$V2 > 5000))

Converting unix timestamp to calendar day

#' Convert Unix epoch seconds to a POSIXct date-time.
#'
#' @param time Numeric vector of seconds since 1970-01-01 00:00:00 UTC.
#' @return A `POSIXct` vector with one element per element of `time`. No time
#'   zone is attached, so it prints in the session's local time zone.
unix2POSIXct <- function(time) {
  # as.POSIXct() produces the class vector c("POSIXct", "POSIXt"). The
  # previous hand-built class vector had the two entries in the reverse
  # order, which is not the order current R uses for date-time objects.
  as.POSIXct(time, origin = "1970-01-01")
}

> unix2POSIXct(1314860375)

[1] "2011-08-31 23:59:35 PDT"

http://stackoverflow.com/questions/8293547/how-to-plot-a-subset-of-a-data-frame-in-r

--------------------------------------------------------------------

# Toy data set: temperatures from -20 to 40 in steps of 1, paired with 61
# evenly spaced altitudes from 50 to 2500.
dat <- data.frame(
  temperature = seq(from = -20, to = 40, by = 1),
  altitude = seq(from = 50, to = 2500, length.out = 61)
)

# Scatter plot of the whole data frame.
plot(dat)

# Formula interface: only the rows with temperature >= 0 are drawn.
plot(altitude ~ temperature, data = dat, subset = temperature >= 0)

# The whole data frame again, for comparison.
plot(dat)

http://www.slideshare.net/jeffreybreen/grouping-summarizing-data-in-r

https://code.google.com/p/sqldf/

http://stackoverflow.com/questions/6370383/whats-the-r-way-to-do-the-following-group-by

http://stackoverflow.com/questions/11349741/group-by-like-command-in-r-with-min-as-aggregate-function-and-multiple-columns

------------------------------------------------------

-- Table d: one row per reading, holding the timestamp, the measured value and
-- the weekday, month, day, time, year, hour and minute columns.
create table d(timestamp int, value int, weekday varchar(4), month varchar(4), day int, time varchar(10),year int, hour int, minute int);

-- Empty the table before (re)loading it.
truncate table d;

-- Bulk-load the comma-separated file into d, one record per line.
IMPORT FROM CSV FILE 'C:\\BIDGELY\\data_py2.txt' INTO d WITH RECORD DELIMITED BY '\n' FIELD DELIMITED BY ',';

-- Row counts per value bucket: each SELECT yields one labelled row and UNION
-- combines them into a single result set.
-- NOTE(review): this query reads from table `test`, while the CREATE/IMPORT
-- statements in this file use table `d` - confirm which table is intended.
-- NOTE(review): there is no ORDER BY, so the bucket rows are not guaranteed
-- to come back in ascending order.
select ' <= 300', count(*) from test where value <= 300

union

select '>300 <=700', count(*) from test where value > 300 AND value <= 700

union

select '>700 <=1000', count(*) from test where value > 700 AND value <= 1000

union

select '>1000 <=2000', count(*) from test where value > 1000 AND value <= 2000

union

select '>2000 <=3000', count(*) from test where value > 2000 AND value <= 3000

union

select '>3000 <=4000', count(*) from test where value > 3000 AND value <= 4000

union

select '>4000 <=5000', count(*) from test where value > 4000 AND value <= 5000

union

select '>5000 <=6000', count(*) from test where value > 5000 AND value <= 6000

union

select '>6000 <=7000', count(*) from test where value > 6000 AND value <= 7000

union

select '>7000 <=8000', count(*) from test where value > 7000 AND value <= 8000

union

select '>8000 <=9000', count(*) from test where value > 8000 AND value <= 9000

union

select '>9000 <=10000', count(*) from test where value > 9000 AND value <= 10000

union

select '>10000 <=11000', count(*) from test where value > 10000 AND value <= 11000

union

select '>11000 <=12000', count(*) from test where value > 11000 AND value <= 12000

union

-- Open-ended top bucket: everything above 12000.
select '>12000 ', count(*) from test where value > 12000

----------------------------------------------------------------------

-- NOTE(review): the first query reads table `d`, the other two read table
-- `data` - confirm whether these are meant to be the same table.

-- Total value per hour.
select hour, sum(value) from d group by hour order by hour;

-- Row count and total value per day.
select day, count(*), sum(value) from data group by day order by day;

-- Row count and total value per weekday.
select weekday, count(*), sum(value) from data group by weekday order by weekday;

------------------------------------------------------------------------

import time

# Input file: one "unixtime,value" record per line.
path = "data.txt"
#path = "data_small.txt"

# The with-block closes the file even if a malformed line raises.
with open(path, 'r') as fin:
    for line in fin:
        # Skip blank lines (e.g. a trailing newline at the end of the file).
        if not line.strip():
            continue

        (unixtime, value) = line.rstrip().split(',')

        # time.ctime() renders the timestamp in local time,
        # e.g. "Wed Aug 31 23:59:35 2011".
        calendar = time.ctime(float(unixtime))
        # print(calendar, calendar[11:13], calendar[14])

        # ctime() pads single-digit days with an extra space ("Aug  1").
        # Splitting on runs of whitespace keeps the output at exactly five
        # calendar fields; replacing each space with a comma produced an
        # empty extra field for days 1-9.
        cal = ",".join(calendar.split())

        hour = calendar[11:13]
        # NOTE(review): index 14 is only the tens digit of the minute; use
        # calendar[14:16] if the full two-digit minute is wanted.
        minute = calendar[14]

        # print() with a single parenthesised argument works on both
        # Python 2 and Python 3.
        print("%s,%s,%s,%s,%s" % (unixtime, value, cal, hour, minute))

http://www.statmethods.net/advgraphs/layout.html several plots layout par() mfrow()

http://www.stat.auckland.ac.nz/~paul/Talks/Rgraphics.pdf

http://www.r-bloggers.com/engineering-data-analysis-with-r-and-ggplot2-%E2%80%93-a-google-tech-talk-given-by-hadley-wickham/

http://www.r-statistics.com/

https://sites.google.com/site/r4statistics/

http://www.r-bloggers.com

http://learnr.wordpress.com/ R language

http://learnr.wordpress.com/2009/06/28/ggplot2-version-of-figures-in-lattice-multivariate-data-visualization-with-r-part-1/

http://had.co.nz/ggplot2/book/

Script to produce barplot

File: 0.txt

-6 3318085 22316093102

-5 380604717 1808012186266

-4 133078 1827654758

-3 14970348 72388068541

-1 510845988 990690275168

0 9793840 54337511506

1 10779447 60064465195

2 207670 811929902

3 49057 165876044

6 1379432 7225461223

7 5739374 23773842841

8 25149155 99311740610

9 4695941 44477979918

10 4035427 13287655789

12 984193 3325133371

# Read the per-timeshift table shown above: V1 = timeshift, V2 and V3 are the
# two measures plotted below as records and size.
zero <- read.table("0.txt")

mycolors <- c("darkblue", "red")

# Each measure expressed as a percentage of its own column total.
pct_records <- 100 * zero$V2 / sum(zero$V2)
pct_size <- 100 * zero$V3 / sum(zero$V3)

# Two rows -> two bars per timeshift, drawn side by side.
barplot(
  rbind(pct_records, pct_size, deparse.level = 0),
  names.arg = zero$V1,
  xlab = "Timeshift in hours ( -1 corresponds to src_pty=NULL) ",
  ylab = "% from total",
  main = "% of total records (blue) and % of total size(red) per timeshift 2011/02/27 00:00",
  beside = TRUE,
  col = mycolors
)

legend(
  "topright",
  c("%Records", "%Size"),
  cex = 0.6,
  bty = "n",
  fill = mycolors
)

----------------------------------

mycolors <- rainbow(7)

# Full count plus six sampled variants of the same per-timeshift table.
all <- read.table("0.txt")
step1000 <- read.table("00-step1000.txt")
rand1000 <- read.table("00-rand1000.txt")
step10000 <- read.table("00-step10000.txt")
rand10000 <- read.table("00-rand10000.txt")
stepmln <- read.table("00-step-mln.txt")
randmln <- read.table("00-rand-mln.txt")

# Share of each entry in the column total, in percent. The total is summed as
# double because a sum of integers that exceeds .Machine$integer.max returns
# NA; the `all` row previously lacked this guard.
pct_of_total <- function(v) {
  100 * v / sum(as.numeric(v))
}

# One row per data source, one column per timeshift.
size_pct <- rbind(
  pct_of_total(all$V3),
  pct_of_total(step1000$V3),
  pct_of_total(rand1000$V3),
  pct_of_total(step10000$V3),
  pct_of_total(rand10000$V3),
  pct_of_total(stepmln$V3),
  pct_of_total(randmln$V3)
)

# NOTE(review): the bar labels come from step1000 only; this assumes every
# file lists the same timeshifts in the same order - confirm.
barplot(
  size_pct,
  names.arg = step1000$V1,
  xlab = "Timeshift in hours ",
  ylab = "% from total ABF1 feed size",
  main = "ABF1 1 hour feed: % of total size per timeshift 2011/02/27 00:00",
  beside = TRUE,
  col = mycolors
)

legend(
  "topright",
  c(
    "Counting size of all records",
    "Sampling step=1000",
    "Sampling randomly 1000",
    "Sampling step=10,000",
    "Sampling randomly 10,000",
    "Sampling step=1,000,000",
    "Sampling randomly 1,000,000"
  ),
  cex = 0.6,
  bty = "n",
  fill = mycolors
)

--------------------------------

# V3 divided by 10^9 and labelled as GB, one bar per timeshift in V1.
size_gb <- zero$V3 / 1000000000

barplot(
  size_gb,
  names.arg = zero$V1,
  xlab = "Timeshift in hours ( -1 corresponds to src_pty=NULL) ",
  ylab = "GB",
  main = "GB per timeshift for /data/FETL/ABF1_SEQ/201102270000/"
)

----------------------------------------

mycolors <- rainbow(5)

# NOTE(review): `n` is not created anywhere in this section; presumably a
# table read elsewhere with V1 = filterTag and V2 = record count - confirm.
# Each V2 entry as a percentage of the V2 total.
pct_null <- 100 * n$V2 / sum(n$V2)

barplot(
  pct_null,
  names.arg = n$V1,
  col = mycolors,
  xlab = "filterTag ",
  ylab = "% of records with src_pty = NULL",
  main = " ABF1 records distibution for src_pty = NULL"
)

-----------------------------------

Page updated

Google Sites

Report abuse