# Refresher R for Beginners

R Studio Environment

R Location (OSX)

\$ ls -l /Library/Frameworks/R.framework/Versions

#Get R Version

version

Environment

getwd()

setwd("/Users/avkashchauhan/work/global")

getwd()

dir()

#Getting Help

help(getwd)

filename <- "test.csv"

filex <- read.csv(filename)

filex

summary(filex)

filex\$id

filex\$name

filex\$age

filex\$zip

names(filex)

attributes(filex)

# Listing All Vars

ls()

# ls() – List of all variables

# DataTypes & number Assignment

asc <- c(1,2,3,4,5,6,7,8,9,10)

# What is c? c is “combine”

asc[2]

asc[5]

asc[5:6]

asc[1:9]

View(asc)

a <- 10

a

a[1]

a[3]

help(sqrt)

a <- sqrt(10)

a

a <- sqrt(10*a)

a

asc

mean(asc)

median(asc)

help(var)

typeof(asc)

typeof(a)

# String data type

a <- c("this", "is", "so", "fun")

a

a[1]

typeof(a)

#Understanding c or combine

a <- 10

> a

[1] 10

> a[1]

[1] 10

> a[2]

[1] NA

> a <- c(10)

> a

[1] 10

> a[2]

[1] NA

# DATAFRAME

# creating a data frame

a <- c(1,2,3,4,5,6,7,8,9,10)

b <- c(10,20,30,40,50,60,70,80,90,100)

ab <- data.frame(first=a, second=b)

ab

ab[1]

ab[1][1]

ab[1][2]  # ← XXX (error: undefined columns selected)

ab[2]

ab[2][1]

ab[2][2]  # ← XXX (error: undefined columns selected)

ab\$first

ab\$second

ab\$second[1]

ab\$second[3]

ab\$first[10]

View(ab)

#Logical

a <- c(TRUE)

a

typeof(a)

a <- c(FALSE)

a

typeof(a)

#Conditions in R

a <- c(TRUE)

if(!a) a <- c(FALSE)

a  # ← Still TRUE

if(a) a <- c(FALSE)

a  # ← FALSE now

a <- c(TRUE,FALSE)

a

a[1]

a[2]

if (a[1]) a[2] <- TRUE

a

Factor in R – A “factor” is a vector whose elements can take on one of a specific set of values. For example, “Sex” will usually take on only the values “M” or “F,” whereas “Name” will generally have lots of possibilities. The set of values that the elements of a factor can take are called its levels.

a <- factor(c("Male", "Female", "Female", "Male", "Male"))

a

a <- factor(c("A", "A", "B", "A", "B", "B", "C", "A", "C"))

a

Tables: (One way and two way)

a <- factor(c("Male", "Female", "Female", "Male", "Male"))

a

mytable <- table(a)

a

mytable

summary(a)

attributes(a)

#datatype check R

#Example #1

a <- c(1,2,4)

is.numeric(a)

is.factor(a)

#Example #2

b <- factor(c("M", "F"))

b

is.factor(b)

is.numeric(b)

Graph Plotting in R

Using Library ggplot2

#installing ggplot2

install.packages("ggplot2")

also installing the dependencies ‘colorspace’, ‘Rcpp’, ‘stringr’, ‘RColorBrewer’, ‘dichromat’, ‘munsell’, ‘labeling’, ‘plyr’, ‘digest’, ‘gtable’, ‘reshape2’, ‘scales’, ‘proto’

Using ggplot2 Library

library(ggplot2)

detach(package:ggplot2)

View(diamonds)

qplot(clarity, data=diamonds, fill=cut, geom="bar")

qplot(clarity, data=diamonds, geom="bar", fill=cut, position="stack")

qplot(clarity, data=diamonds, geom="freqpoly", group=cut, colour=cut, position="identity")

qplot(carat, data=diamonds, geom="histogram", binwidth=0.1)

qplot(carat, data=diamonds, geom="histogram", binwidth=0.01)

Keywords: R, Analysis, ggplot

Let's start with the Cloudera Enterprise Data Hub:

Here is the offering from Hortonworks:

And this is how MapR is packaging Enterprise Hadoop

And finally Pivotal Enterprise Hadoop offering:

Keywords: Apache Hadoop, Cloudera, Hortonworks, Pivotal, MapR, Big Data

# Big Data – Transition from Velocity, Variety and Volume to adding Variability and Complexity in the mix

Previous Definition: Velocity, Variety and Volume

New Definition: Velocity, Variety and Volume + Variability and Complexity

# A collection of Big Data Books from Packt Publication

I found that Packt Publishing has a few great books on Big Data, and here is a collection of a few books which I found very useful:

Packt is giving its readers a chance to dive into its comprehensive catalog of over 2000 books and videos for the next 7 days with its LevelUp program:

Packt is offering all of its eBooks and Videos at just \$10 each or less

The more EXP customers want to gain, the more they save:

• Any 1 or 2 eBooks/Videos – \$10 each
• Any 3 to 5 eBooks/Videos – \$8 each
• Any 6 or more eBooks/Videos – \$6 each

# Data Data Everywhere

Data Data Everywhere

# 3 Levels of Analytics

3 Levels of Analytics