03 May 2016
Virtualenv is a tool for creating isolated ‘virtual’ Python environments. It ensures that different versions of packages and dependencies can coexist on the same machine.
To install: pip install virtualenv
Basic commands: here is a great tutorial for virtualenv.
To use virtualenv with Python 3, we can do the following (reference):
pip install --upgrade virtualenv
virtualenv -p python3 envname
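Once the environment is activated (e.g. source envname/bin/activate on Linux/macOS), a quick sanity check from inside Python — a minimal sketch, assuming the environment was created as envname above:
import sys

# Inside an active virtualenv, sys.prefix points into the environment directory
# (e.g. .../envname) rather than the system Python installation.
print(sys.executable)
print(sys.prefix)
# Works for both virtualenv (sets sys.real_prefix) and the built-in venv module (sets sys.base_prefix).
in_env = hasattr(sys, 'real_prefix') or sys.prefix != getattr(sys, 'base_prefix', sys.prefix)
print(in_env)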
17 Apr 2016
R basics
# load data
data(mtcars)
# structure of data frame
str(mtcars)
# dimension of the data
dim(mtcars)
# row names
row.names(mtcars)
# access individual variable
mtcars$mpg
# get names
names(mtcars)
# get current directory
getwd()
# set new directory (pass the path as a string)
setwd('path/to/directory')
# Setting levels of ordered factors solution
reddit$age.range <- ordered(reddit$age.range,
levels = c('Under 18', '18-24', '25-34', '35-44', '45-54', '55-64', '65 or Above'))
# Alternate Solution
reddit$age.range <- factor(reddit$age.range,
levels = c('Under 18', '18-24', '25-34', '35-44', '45-54', '55-64', '65 or Above'), ordered = T)
Explore one variable
pf <- read.csv('pseudo_facebook.tsv', sep = '\t')
names(pf)
# ggplot
ggplot(aes(x = dob_day), data = pf) +
geom_histogram(binwidth = 1) +
scale_x_continuous(breaks = 1:31)
# use facet_wrap
ggplot(data = pf, aes(x = dob_day)) +
geom_histogram(binwidth = 1) +
scale_x_continuous(breaks = 1:31) +
facet_wrap(~dob_month)
ggplot(aes(x = friend_count), data = subset(pf, !is.na(gender))) +
geom_histogram(binwidth = 25) +
scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
facet_wrap(~gender)
# make a table
table(pf$gender)
# use by to summary
by(pf$friend_count, pf$gender, summary)
ggplot(aes(x = tenure / 365), data = pf) +
geom_histogram(color = 'black', fill = '#F79420') +
scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7)) +
xlab('Number of years using Facebook') +
ylab('Number of users in sample')
# Transforming data solution
library(gridExtra)
p1 <- qplot(x = friend_count, data = pf)
p2 <- qplot(x = log10(friend_count), data = pf)
p3 <- qplot(x = sqrt(friend_count), data = pf)
grid.arrange(p1, p2, p3, ncol = 1)
# alternative method
p1 <- ggplot(aes(x = friend_count), data = pf) +
geom_histogram()
p2 <- p1 + scale_x_log10()
p3 <- p1 + scale_x_sqrt()
grid.arrange(p1, p2, p3, ncol = 1)
# Frequency Polygons
ggplot(aes(x = friend_count, y = ..count../sum(..count..)), data = subset(pf, !is.na(gender))) +
geom_freqpoly(aes(color = gender), binwidth=10) +
scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
xlab('Friend Count') +
ylab('Percentage of users with that friend count')
# Box plot
# box plot
qplot(x = gender, y = friend_count, data = subset(pf, !is.na(gender)),
geom = 'boxplot') +
coord_cartesian(ylim = c(0, 1000))
# scatter plots
qplot(x = age, y = friend_count, data = pf)
# alternative scatter plots
ggplot(aes(x = age, y = friend_count), data = pf) +
geom_point() +
xlim(13, 90)
# alpha is transparency
# jitter to avoid overplotting
ggplot(aes(x = age, y = friend_count), data = pf) +
geom_jitter(alpha = 1/20, position = position_jitter(h = 0)) +
xlim(13, 90) +
coord_trans(y = 'sqrt')
facet_wrap(formula)
facet_wrap(~variable)
facet_grid(formula)
facet_grid(vertical~horizontal)
Explore two variables
# two variables
library(dplyr)
age_groups <- group_by(pf, age)
pf.fc_by_age <- summarise(age_groups,
friend_count_mean = mean(friend_count),
friend_count_median = median(friend_count),
n = n())
pf.fc_by_age <- arrange(pf.fc_by_age, age)
head(pf.fc_by_age)
# alternative way
pf.fc_by_age <- pf %>%
group_by(age) %>%
summarise(friend_count_mean = mean(friend_count),
friend_count_median = median(friend_count),
n = n()) %>%
arrange(age)
# overlaying summaries with raw data
# (only one coordinate system applies per plot, so coord_cartesian is used here rather than coord_trans)
ggplot(aes(x = age, y = friend_count), data = pf) +
coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000)) +
geom_point(alpha = 0.05,
position = position_jitter(h = 0),
color = 'orange') +
geom_line(stat = 'summary', fun.y = mean) +
geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = 0.1),
linetype = 2, color = 'blue') +
geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = 0.5),
linetype = 2, color = 'blue') +
geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = 0.9),
linetype = 2, color = 'blue')
# correlation
cor.test(pf$age, pf$friend_count, method = 'pearson')
# alternative way
with(pf, cor.test(age, friend_count, method = 'pearson'))
# subset
with(subset(pf, age <= 70), cor.test(age, friend_count, method = 'pearson'))
# create scatterplots, strong correlations
ggplot(aes(x = www_likes_received, y = likes_received), data = pf) +
geom_point() +
xlim(0, quantile(pf$www_likes_received, 0.95)) +
ylim(0, quantile(pf$likes_received, 0.95)) +
geom_smooth(method = 'lm', color = 'red')
# age with months means
pf.fc_by_age_months <- pf %>%
group_by(age_with_months) %>%
summarise(friend_count_mean = mean(friend_count),
friend_count_median = median(friend_count),
n = n()) %>%
arrange(age_with_months)
# plot together
p1 <- ggplot(aes(x = age, y = friend_count_mean),
data = subset(pf.fc_by_age, age < 71)) +
geom_line() +
geom_smooth()
p2 <- ggplot(aes(x = age_with_months, y = friend_count_mean),
data = subset(pf.fc_by_age_months, age_with_months < 71)) +
geom_line() +
geom_smooth()
p3 <- ggplot(aes(x = round(age / 5) * 5, y = friend_count),
data = subset(pf, age < 71)) +
geom_line(stat = 'summary', fun.y = mean)
grid.arrange(p2, p1, p3, ncol = 1)
# third qualitative variable
ggplot(aes(x = gender, y = age), data = subset(pf, !is.na(gender))) +
geom_boxplot() +
stat_summary(fun.y = mean, geom = 'point', shape = 4)
ggplot(aes(x = age, y = friend_count), data = subset(pf, !is.na(gender))) +
geom_line(aes(color = gender), stat = 'summary', fun.y = median)
pf.fc_by_age_gender <- pf %>%
filter(!is.na(gender)) %>%
group_by(age, gender) %>%
summarise(mean_friend_count = mean(friend_count),
median_friend_count = median(friend_count),
n = n()) %>%
ungroup() %>%
arrange(age)
# cut a variable
pf$year_joined <- floor(2014 - pf$tenure/365)
pf$year_joined.bucket <- cut(pf$year_joined,
c(2004, 2009, 2011, 2012, 2014))
# Friend rate
with(subset(pf, tenure >= 1), summary(friend_count / tenure))
Explore many variables
# load data and see its structure
yo <- read.csv('yogurt.csv')
str(yo)
# change the id from an int to a factor
yo$id <- factor(yo$id)
str(yo)
# histogram
qplot(data = yo, x = price, fill = I('#F79420'))
# number of purchases
summary(yo)
length(unique(yo$price))
table(yo$price)
# add new variable all.purchases
yo <- transform(yo, all.purchases = strawberry + blueberry + pina.colada + plain + mixed.berry)
# scatter plot
ggplot(aes(x = time, y = price), data = yo) +
geom_jitter(alpha = 1/4, shape = 21, fill = I('#F79420'))
# look at samples of households
set.seed(4230)
sample.ids <- sample(levels(yo$id), 16)
ggplot(aes(x = time, y = price),
data = subset(yo, id %in% sample.ids)) +
facet_wrap( ~ id) +
geom_line() +
geom_point(aes(size = all.purchases), pch = 1)
# scatterplot matrices
library(GGally)
theme_set(theme_minimal(20))
set.seed(1836)
pf_subset <- pf[, c(2:15)]
names(pf_subset)
ggpairs(pf_subset[sample.int(nrow(pf_subset), 1000), ])
######################
# even more variables
nci <- read.table('nci.tsv')
# change the colnames to produce a nicer plot
colnames(nci) <- c(1:64)
# create a heat map
# melt data to long format (melt() is from the reshape2 package)
library(reshape2)
nci.long.samp <- melt(as.matrix(nci[1:200, ]))
names(nci.long.samp) <- c('gene', 'case', 'value')
head(nci.long.samp)
# make the heat map
ggplot(aes(y = gene, x = case, fill = value),
data = nci.long.samp) +
geom_tile() +
scale_fill_gradientn(colors = colorRampPalette(c('blue', 'red'))(100))
14 Apr 2016
How to represent meaning in a computer?
- Common answer: use a taxonomy like WordNet that has hypernym (is-a) relationships and synonym sets
- missing nuances, new words
- subjective
- requires human labor to create and adapt
- hard to compute accurate word similarity
- “one-hot” representation
- Distributional similarity based representations
- word-document cooccurrence matrix will give general topics
- window around each word captures both syntactic and semantic information
Problems with simple cooccurrence vectors
- Increase in size with vocabulary
- Very high dimensional: requires a lot of storage
- Subsequent classification models have sparsity issues
- Models are less robust
Solution: store most of the important information in a fixed, small number of dimensions with the help of dimensionality reduction methods (e.g. apply PCA to cooccurrence matrix)
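For instance, a minimal numpy sketch of this idea: build a tiny window-1 co-occurrence matrix (a toy example) and reduce it with a truncated SVD, which is closely related to PCA:
import numpy as np

# Toy word-word co-occurrence counts (rows/cols indexed by the vocabulary below).
vocab = ['i', 'like', 'enjoy', 'deep', 'learning', 'nlp', 'flying', '.']
X = np.array([
    [0, 2, 1, 0, 0, 0, 0, 0],
    [2, 0, 0, 1, 0, 1, 0, 0],
    [1, 0, 0, 0, 0, 0, 1, 0],
    [0, 1, 0, 0, 1, 0, 0, 0],
    [0, 0, 0, 1, 0, 0, 0, 1],
    [0, 1, 0, 0, 0, 0, 0, 1],
    [0, 0, 1, 0, 0, 0, 0, 1],
    [0, 0, 0, 0, 1, 1, 1, 0],
], dtype=float)

# Full SVD, then keep only the top-k singular directions as low-dimensional word vectors.
U, s, Vt = np.linalg.svd(X, full_matrices=False)
k = 2
word_vectors = U[:, :k] * s[:k]          # each row is a k-dimensional word embedding
print(dict(zip(vocab, word_vectors.round(2))))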
Problem with PCA:
- Computational cost scales quadratically (bad for millions of words and documents)
- Hard to incorporate new words or documents
- Different learning regime than other DL models
Idea: directly learn low-dimensional word vectors
- Learning representations by back-propagating errors
- Neural probabilistic language models
- Word2Vec
Main idea of Word2Vec:
- Instead of capturing cooccurrence counts directly
- Predict the surrounding words of every word
- Faster and can easily incorporate new sentences and documents
Details of Word2Vec:
- Predict surrounding words in a window of length m of every word
- Objective function: maximize the log probability of any context word given the current center word
- $J(\theta) = \frac{1}{T} \sum_{t=1}^T \sum_{-m \le j \le m, j \ne 0} \log p(w_{t+j} \vert w_t)$, where $\theta$ represents all the variables we optimize
- $p(o \vert c) = \frac{\exp(u_o^T v_c)}{\sum_{w=1}^W \exp(u_w^T v_c)}$, where $o$ is the outside word id, $c$ is the center word id, $u_o$ is the “outside” vector of $o$, and $v_c$ is the “center” vector of $c$
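A minimal numpy sketch of the softmax probability above (vocabulary size, embedding dimension, and the random vectors are arbitrary placeholders):
import numpy as np

np.random.seed(0)
W, d = 10, 4                      # vocabulary size, embedding dimension (arbitrary)
U = np.random.randn(W, d) * 0.1   # "outside" vectors u_w
V = np.random.randn(W, d) * 0.1   # "center" vectors v_w

def p_outside_given_center(o, c):
    """Softmax probability p(o | c) = exp(u_o^T v_c) / sum_w exp(u_w^T v_c)."""
    scores = U @ V[c]             # u_w^T v_c for every word w in the vocabulary
    scores -= scores.max()        # subtract the max for numerical stability
    exp_scores = np.exp(scores)
    return exp_scores[o] / exp_scores.sum()

print(p_outside_given_center(o=3, c=7))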
Count based vs direct prediction
- Count based: LSA, PCA
- fast training
- efficient usage of statistics
- primarily used to capture word similarity
- disproportionate importance given to large counts
- Direct prediction: RNN
- scales with corpus size
- inefficient usage of statistics
- generate improved performance on other tasks
- capture complex patterns beyond word similarity
GloVe: combine them together
- Fast training
- Scalable to huge corpora
- Good performance even with small corpus and vectors
- $J(\theta) = \frac{1}{2} \sum_{i,j=1}^W f(P_{ij})(u_i^T v_j - \log P_{ij})^2$
Skip-gram model: train binary logistic regressions for a true pair (center word and a word in its context window) and a couple of random pairs (the center word with a random word)
Continuous bag of words (CBOW) model: predict the center word from the sum of surrounding word vectors, instead of predicting the surrounding words from the center word as in the skip-gram model
- If you only have a small training dataset, do not train the word vectors.
- If you have a very large dataset, it may work better to train the word vectors.
General strategy for successful Neural Nets
- Select a network structure appropriate for the problem: sigmoid, tanh, ReLU
- Check for implementation bugs with gradient checks
- Parameter initialization
- Initialize hidden layer biases to 0 and output biases to optimal value if weights were 0
- Initialize weights $\sim \mathrm{Uniform}(-r, r)$, with $r$ inversely proportional to fan-in (previous layer size) and fan-out (next layer size), e.g. $r = \sqrt{6/(\mathrm{fan\_in} + \mathrm{fan\_out})}$
- Optimization:
- SGD usually wins over all batch methods on large datasets
- L-BFGS or Conjugate Gradients win on smaller datasets
- Learning rates
- Simplest recipe: keep it fixed and use the same for all parameters
- Collobert scales them by the inverse of the square root of the fan-in of each neuron
- Better results can generally be obtained by allowing learning rates to decrease in O(1/t)
- Better yet: no hand-set learning rates by using L-BFGS or AdaGrad
- Prevent overfitting
- simple first step: reduce model size by lowering number of units and layers and other parameters
- standard L1 or L2 regularization on weights
- early stopping: use parameter that gave best validation error
- sparsity constraints on hidden activations
- dropout
Adagrad:
- Standard SGD with a fixed learning rate $\alpha$: $\theta_{new} = \theta_{old} - \alpha \nabla_{\theta} J_t(\theta)$
- Instead: adaptive learning rates; the learning rate adapts separately for each parameter, and rarely updated parameters get larger updates than frequently updated ones.
- $g_{t,i} = \frac{\partial}{\partial \theta_{t,i}}J_t(\theta)$
- $\theta_{t,i} = \theta_{t-1, i} - \frac{\alpha}{\sqrt{\sum_{\tau=1}^t g_{\tau,i}^2}} g_{t,i}$
Deep learning tricks of the trade (Y. Bengio, 2012):
- unsupervised pre-training
- SGD and setting learning rates
- main hyper-parameters
- learning rate schedule and early stopping
- minibatches
- parameter initialization
- number of hidden units
- L1 or L2 weight decay
- sparsity regularization
- how to efficiently search for hyper-parameter configuration:
- random hyperparameter search
A language model computes a probability for a sequence of words. Probability is usually conditioned on window of n previous words.
RNN: condition the neural network on all previous words and tie the weights at each time step
RNN language model: use the same set of weights at all time steps.
- $h_t = \sigma (W^{(hh)}h_{t-1} + W^{(hx)}x_{t})$
- $\hat{y}_t = softmax(W^{(S)}h_t)$
- $\hat{P}(x_{t+1}=v_j \vert x_t, \ldots, x_1) = \hat{y}_{t,j}$
- $J^{(t)}(\theta) = - \sum_{j=1}^{\vert V \vert} y_{t,j} \log \hat{y}_{t,j}$
- $J = \frac{1}{T} \sum_{t=1}^T J^{(t)}$
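A numpy sketch of one forward step of such an RNN language model (sizes, initialization, and the placeholder input are arbitrary):
import numpy as np

np.random.seed(0)
H, D, V = 50, 30, 1000                    # hidden size, input (embedding) size, vocab size
W_hh = np.random.randn(H, H) * 0.01
W_hx = np.random.randn(H, D) * 0.01
W_s  = np.random.randn(V, H) * 0.01

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def rnn_step(h_prev, x_t):
    """h_t = sigma(W_hh h_{t-1} + W_hx x_t);  y_hat_t = softmax(W_s h_t)."""
    h_t = sigmoid(W_hh @ h_prev + W_hx @ x_t)
    scores = W_s @ h_t
    scores -= scores.max()                # numerical stability
    y_hat = np.exp(scores) / np.exp(scores).sum()
    return h_t, y_hat

h0 = np.zeros(H)
x1 = np.random.randn(D)                   # embedding of the first word (placeholder)
h1, y_hat1 = rnn_step(h0, x1)
# Cross-entropy loss J_t = -log(y_hat1[j]) for the true next word j:
print(-np.log(y_hat1[123]))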
Vanishing or exploding gradient problem for RNNs: the gradient is a product of Jacobian matrices, each associated with a step in the forward computation. This product can become very small or very large quickly, and the locality assumption of gradient descent breaks down.
The solution, first introduced by Mikolov, is to clip gradients to a maximum norm. Pseudocode for norm clipping:
- $g \leftarrow \frac{\partial \epsilon}{\partial \theta}$
- if $\Vert g \Vert \ge$ threshold then
- $g \leftarrow \frac{\mathrm{threshold}}{\Vert g \Vert} g$
- end if
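A direct numpy translation of this norm-clipping pseudocode (the threshold value is an arbitrary choice):
import numpy as np

def clip_gradient(g, threshold=5.0):
    """Rescale g so that its L2 norm never exceeds threshold."""
    norm = np.linalg.norm(g)
    if norm >= threshold:
        g = (threshold / norm) * g
    return g

g = np.random.randn(1000) * 10
print(np.linalg.norm(clip_gradient(g)))   # <= 5.0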
For vanishing gradients: initialization + ReLU
- Initialize W to the identity matrix and use $f(z) = rect(z) = max(z, 0)$
Bidirectional RNN: for classification we want to incorporate information from words both preceding and following
Semantic Vector Spaces: vectors representing phrases and sentences that do not ignore word order and capture semantics for NLP tasks.
Model comparison:
- Bag of Vectors: surprisingly good baseline for simple classification problems. Especially if followed by a few layers.
- Window Model: good for single word classification for problems that do not need wide context
- CNNs: good for classification, unclear how to incorporate phrase level annotation (can only take a single label), need zero padding for shorter phrases, hard to interpret, easy to parallelize on GPUs
- Recursive Neural Networks: most linguistically plausible, interpretable, provide most important phrases, need parse trees
- Recurrent Neural Networks: most cognitively plausible (reading from left to right), not usually the highest classification performance but lots of improvements right now with gates.
13 Apr 2016
Loss function
$L = \frac{1}{N} \sum_{i=1}^N \sum_{j \neq y_i} \max(0, f_j(x_i, W) - f_{y_i}(x_i, W) + 1) + \lambda R(W)$
In common use:
- L2 regularization $R(W) = \sum_k \sum_l W_{k,l}^2$
- L1 regularization $R(W) = \sum_k \sum_l \vert W_{k,l} \vert$
- Elastic net (L1+L2) $R(W) = \sum_k \sum_l \beta_1 W_{k,l}^2 + \beta_2 \vert W_{k,l} \vert$
- Dropout: randomly set some neurons to zeros in the forward pass; force the network to have a redundant representation; need to scale at test time.
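The item above describes vanilla dropout (scale at test time); the commonly used inverted variant sketched below scales at training time instead, so test time is a no-op. A minimal numpy sketch (the keep-probability is an arbitrary choice):
import numpy as np

def dropout_forward(h, p_keep=0.5, train=True):
    """Randomly zero activations during training; inverted scaling keeps expectations equal."""
    if not train:
        return h                          # test time: identity with inverted dropout
    mask = (np.random.rand(*h.shape) < p_keep) / p_keep
    return h * mask

h = np.random.randn(4, 10)
out = dropout_forward(h)
print(out)   # roughly half the entries are zero, the rest are scaled by 1/p_keep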
Loss function type:
- Softmax $L_i = -\log\left(\frac{e^{s_{y_i}}}{\sum_j e^{s_j}}\right)$
- SVM (hinge loss) $L_i = \sum_{j \neq y_i} \max(0, s_j - s_{y_i} + 1)$
In practice: always use the analytic gradient, but check the implementation against the numerical gradient. This is called a gradient check.
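A minimal numpy sketch of such a gradient check, comparing a centered-difference numerical gradient with the analytic gradient of a toy loss (the quadratic here is just a stand-in for a real loss):
import numpy as np

def f(w):                       # toy "loss": f(w) = sum(w^2), analytic gradient 2w
    return np.sum(w ** 2)

def grad_analytic(w):
    return 2 * w

def grad_numerical(f, w, h=1e-5):
    g = np.zeros_like(w)
    for i in range(w.size):
        w_plus, w_minus = w.copy(), w.copy()
        w_plus.flat[i] += h
        w_minus.flat[i] -= h
        g.flat[i] = (f(w_plus) - f(w_minus)) / (2 * h)   # centered difference
    return g

w = np.random.randn(5)
num, ana = grad_numerical(f, w), grad_analytic(w)
rel_error = np.abs(num - ana) / np.maximum(1e-8, np.abs(num) + np.abs(ana))
print(rel_error.max())          # should be tiny (around 1e-8 or smaller)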
Activation functions
- Sigmoid $\sigma (x) = \frac{1}{(1+e^{-x})}$
- saturated neurons “kill” the gradients
- Sigmoid outputs are not zero-centered
- exp() is a bit compute expensive
- tanh(x)
- squashes numbers to range [-1, 1]
- zero centered
- still kills gradients when saturated
- ReLU (Rectified Linear Unit) $f(x) = max(0, x)$
- does not saturate
- very computationally efficient
- converges much faster than sigmoid/tanh in practice
- not zero-centered output
- Leaky ReLU $f(x) = max(\alpha x, x)$
- does not saturate
- computationally efficient
- converges much faster than sigmoid/tanh in practice
- will not “die”
- Exponential Linear Units (ELU) $f(x) = x $ if $x > 0$; $f(x) = \alpha (exp(x) -1)$ if $x \le 0$
- all benefits of ReLU
- does not die
- closer to zero mean outputs
- computation requires exp()
- Maxout “neuron” $max(w_1^T x + b_1, w_2^T x + b_2)$
- generalizes ReLU and Leaky ReLU
- linear regime, does not saturate, does not die
- doubles the number of parameters
In practice
- Use ReLU. Be careful with learning rates
- Try out Leaky ReLU/ Maxout/ ELU
- Try out tanh but do not expect much
- Do not use sigmoid
Weight initialization
- Small random numbers: works okay for small networks, but can lead to non-homogeneous distributions of activations across the layers of a network
- Xavier initialization
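A numpy sketch of the two initializations mentioned above (layer sizes are arbitrary; the uniform variant follows the $\sqrt{6/(\mathrm{fan\_in}+\mathrm{fan\_out})}$ rule from the earlier notes):
import numpy as np

fan_in, fan_out = 512, 256

# Small random numbers: fine for shallow nets, but activations can collapse in deep ones.
W_small = 0.01 * np.random.randn(fan_in, fan_out)

# Xavier/Glorot initialization: variance scaled by the layer size.
W_xavier = np.random.randn(fan_in, fan_out) / np.sqrt(fan_in)

# Uniform variant with r = sqrt(6 / (fan_in + fan_out)).
r = np.sqrt(6.0 / (fan_in + fan_out))
W_xavier_uniform = np.random.uniform(-r, r, size=(fan_in, fan_out))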
Batch normalization:
- Improves gradient flow through the network
- Allow higher learning rates
- Reduces the strong dependence on initialization
- Acts as a form of regularization
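A simplified numpy sketch of the batch-normalization forward pass at training time (learnable scale gamma and shift beta, epsilon for numerical stability; the running statistics used at test time are omitted):
import numpy as np

def batchnorm_forward(x, gamma, beta, eps=1e-5):
    """Normalize each feature over the mini-batch, then scale and shift."""
    mu = x.mean(axis=0)
    var = x.var(axis=0)
    x_hat = (x - mu) / np.sqrt(var + eps)
    return gamma * x_hat + beta

x = np.random.randn(64, 100) * 3 + 5          # mini-batch of 64 examples, 100 features
out = batchnorm_forward(x, gamma=np.ones(100), beta=np.zeros(100))
print(out.mean(axis=0)[:3], out.std(axis=0)[:3])  # ~0 and ~1 per feature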
Hyperparameters to play with:
- network architecture
- learning rate, its decay schedule, update type
- regularization
Step update
- Gradient descent
x += -learning_rate * dx
- Momentum update
v = mu * v - learning_rate * dx # integrate velocity
x += v # integrate position
- physical interpretation as ball rolling down the loss function + friction (mu coefficient)
- mu = usually ~0.5, 0.9, or 0.99 (or annealed over time)
- Nesterov momentum update
- $v_t = \mu v_{t-1} - \epsilon \nabla f(\theta_{t-1} + \mu v_{t-1})$
- $\theta_t = \theta_{t-1} + v_t$
- use a variable transform
- $ v_t = \mu v_{t-1} - \epsilon \nabla f(\phi)$
- $\phi_t = \phi_{t-1} - \mu v_{t-1} + (1+\mu) v_t$
- code
v_prev = v # back this up
v = mu * v - learning_rate * dx
x += -mu * v_prev + (1 + mu) * v
- AdaGrad update
cache += dx**2
x += - learning_rate * dx / (np.sqrt(cache) + 1e-7)
- added element-wise scaling of the gradient based on the historical sum of squares in each dimension
- RMSProp update
cache = decay_rate * cache + (1 - decay_rate) * dx**2
x += - learning_rate * dx / (np.sqrt(cache) + 1e-7)
- Adam update
m = beta1 * m + (1-beta1) * dx # update first moment
v = beta2 * v + (1-beta2) * (dx**2) # update second moment
mb = m/(1-beta1**t) # correct bias
vb = v/(1-beta2**t) # correct bias
x += -learning_rate * mb / (np.sqrt(vb) + 1e-7)
Learning rate decay over time:
- step decay: e.g. decay learning rate by half every few epochs
- exponential decay: $\alpha = \alpha_0 e^{-kt}$
- 1/t decay: $\alpha = \alpha_0 / (1+kt)$
Second order optimization methods:
- second order Taylor: $J(\theta) = J(\theta_0) + (\theta-\theta_0)^T \nabla _{\theta}J(\theta_0) + \frac{1}{2} (\theta - \theta_0) ^T H (\theta - \theta_0)$
- solve for critical point: $\hat{\theta} = \theta_0 - H^{-1} \nabla_{\theta} J(\theta_0)$
- Quasi-Newton methods (e.g. BFGS): instead of inverting the Hessian, approximate the inverse Hessian with rank-1 updates over time
- L-BFGS (Limited memory BFGS): does not form/store the full inverse Hessian
- usually works very well in full batch, deterministic code
- does not transfer very well to mini-batch setting
In practice:
- Adam is a good default choice in most cases
- Try L-BFGS if can afford to do full batch updates
Convolutional Neural Network (CNN)
ConvNet is a sequence of Convolution Layers, interspersed with activation functions.
$f(x,y)*g(x,y) = \sum_{n_1 = -\infty}^{\infty} \sum_{n_2 = -\infty}^{\infty} f(n_1, n_2) g(x-n_1, y-n_2)$
The Conv Layer:
- Accepts a volume of size $W_1 \times H_1 \times D_1$
- Requires four hyperparameters:
- Number of filters $K$,
- their spatial extent $F$,
- the stride $S$,
- the amount of zero padding $P$.
- Produces a volume of size $W_2 \times H_2 \times D_2$ where:
- $W_2 = (W_1 - F + 2P)/S + 1$
- $H_2 = (H_1 - F + 2P)/S + 1$ (i.e. width and height are computed equally by symmetry)
- $D_2 = K$
- With parameter sharing, it introduces $F \cdot F \cdot D_1$ weights per filter, for a total of $(F \cdot F \cdot D_1) \cdot K$ weights and $K$ biases.
- In the output volume, the $d$-th depth slice (of size $W_2 \times H_2$) is the result of performing a valid convolution of the $d$-th filter over the input volume with a stride of $S$, and then offset by $d$-th bias.
- Common settings: $F=3$, $S=1$, $P=1$.
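- Worked example: for a $32 \times 32 \times 3$ input with $K = 10$, $F = 3$, $S = 1$, $P = 1$: $W_2 = (32 - 3 + 2 \cdot 1)/1 + 1 = 32$, so the output volume is $32 \times 32 \times 10$, and the layer has $(3 \cdot 3 \cdot 3) \cdot 10 + 10 = 280$ parameters.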
Pooling layer:
- make the representations smaller and more manageable
- operates over each activation map independently
Generally, the pooling layer:
- Accepts a volume of size $W_1 \times H_1 \times D_1$
- Requires three hyperparameters:
- their spatial extent $F$,
- the stride $S$,
- Produces a volume of size $W_2 \times H_2 \times D_2$ where:
- $W_2 = (W_1 - F)/S + 1$
- $H_2 = (H_1 - F)/S + 1$
- $D_2 = D_1$
- Introduces zero parameters since it computes a fixed function of the input
- Note that it is not common to use zero-padding for Pooling layers
- Common setting: $F = 2, S=2$; $F=3, S=2$.
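- Worked example: with $F = 2$, $S = 2$, a $224 \times 224 \times 64$ input is downsampled to $(224 - 2)/2 + 1 = 112$ on each side, i.e. $112 \times 112 \times 64$, discarding 75% of the activations while leaving the depth unchanged.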
How to stack convolutions:
- Replace large convolutions (5x5, 7x7) with stacks of 3x3 convolutions
- 1x1 “bottleneck” convolutions are very efficient
- Can factor NxN convolutions into 1xN and Nx1
- All of the above give fewer parameters, less compute, and more nonlinearity
Convolution Theorem: the convolution of f and g is equal to the elementwise product of their Fourier Transforms: $F(f * g) = F(f)F(g)$. Using the FFT, we can compute the DFT of an N-dimensional vector in $O(N \log N)$ time.
Implementing convolutions with the FFT:
- Compute FFT of weights: F(W)
- Compute FFT of image: F(X)
- Compute elementwise product: F(W)F(X)
- Compute inverse FFT: $Y=F^{-1}(F(W)F(X))$
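A numpy sketch of these four steps for a single-channel image and filter, checked against a direct 2-D convolution from scipy (sizes here are arbitrary):
import numpy as np
from scipy.signal import convolve2d

X = np.random.randn(32, 32)               # image
W = np.random.randn(5, 5)                 # filter weights

# Output size of the full linear convolution; zero-pad both FFTs to this shape.
out_shape = (X.shape[0] + W.shape[0] - 1, X.shape[1] + W.shape[1] - 1)

FX = np.fft.fft2(X, out_shape)            # FFT of image
FW = np.fft.fft2(W, out_shape)            # FFT of weights
Y = np.fft.ifft2(FX * FW).real            # elementwise product, then inverse FFT

print(np.allclose(Y, convolve2d(X, W, mode='full')))   # True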
Segmentation:
- Semantic segmentation
- classify all pixels
- fully convolutional models, downsample then upsample
- learnable upsampling: fractionally strided convolution
- skip connections can help
- Instance segmentation
- detect instance, generate mask
- similar pipelines to object detection
Attention:
- Soft attention:
- easy to implement: produce distribution over input locations, reweight features and feed as input
- attend to arbitrary input locations using spatial transformer networks
- Hard attention:
- attend to a single input location
- cannot use gradient descent
- need reinforcement learning
Unsupervised learning:
- Autoencoders
- Traditional: feature learning, reconstruct input, not used much anymore
- Variational: generate samples, Bayesian meets deep learning
- Generative adversarial networks: generate samples
Recurrent Neural Networks (RNN)
$ h_t = f_w (h_{t-1}, x_t)$, where $h_t$ is new state, $h_{t-1}$ is old state, $f_w$ is some function with parameters $w$, $x_t$ is input vector at some time step.
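For example, the simplest vanilla RNN uses a single tanh layer: $h_t = \tanh(W_{hh} h_{t-1} + W_{xh} x_t)$ with an output $y_t = W_{hy} h_t$ (the weight names here follow common convention rather than this note's generic $f_w$).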
10 Apr 2016
1. Docker Basics
Docker is an open platform for developing, packaging, and running portable distributed applications.
What makes Docker different from virtual machines is that while each virtual machine runs a separate guest OS, Docker containers share the host's OS kernel.
Install Docker with the official Get Started with Docker guide.
Docker cheat sheet
An instance of an image is called a container.
List all images: $ docker images
List all containers: $ docker ps -a
Download a pre-built image (ubuntu): $ docker pull ubuntu
Run a container from a specific image:
$ sudo docker run -i -t <image_id || repository:tag> /bin/bash
Start an existing container:
$ sudo docker start -i <image_id>
Attach to a running container:
$ sudo docker attach <container_id>
Exit without shutting down a container:
[Ctrl-p] + [Ctrl-q]
Start a new container:
$ JOB=$(docker run -d ubuntu /bin/sh -c "while true; do echo Hello world; sleep 1; done")
Stop the container: $ docker stop $JOB
Start the container: $ docker start $JOB
Restart the container: $ docker restart $JOB
Kill a container: $ docker kill $JOB
Remove a container:
$ docker stop $JOB # Container must be stopped to remove it
$ docker rm $JOB
or $ docker rm <container_id>
Remove an image:
$ docker rmi <image_id> # container must be removed first
Search for images: $ docker search <image_name>
Pull image: $ docker pull <image_name>
Commit your container to a new named image:
# run a container
$ docker run -it <image_name> /bin/bash
# make some changes in the container, then exit.
# commit a copy of this container to an image.
$ docker commit -m "changes content" <container_id> <new_image_name>
# -m is for the commit message
Build an image:
# create a Dockerfile like the following:
FROM ubuntu:14.04
MAINTAINER Kate Smith <ksmith@example.com>
RUN apt-get update && apt-get install -y ruby ruby-dev
RUN gem install sinatra
$ docker build -t <user>/<image_name>:<tag> .
Docker command flags:
-d: run in background
-P: map any required network ports inside the container to the host
-i: interactive
2. Docker for Hadoop
The image I used to launch Hadoop is Hadoop-docker. Following the instructions in the README file, you can launch and test the Hadoop platform pretty easily.
To use the image: $ docker run -it sequenceiq/hadoop-docker:2.7.1 /etc/bootstrap.sh -bash
3. Docker for Spark
Docker-spark
To use the image: $ docker run -d -h sandbox sequenceiq/spark:1.6.0 -d