03 May 2016
Virtualenv is a tool for creating isolated ‘virtual’ Python environments. It ensures that different versions of packages and dependencies can coexist on the same machine.
To install: pip install virtualenv
Basic commands: here is a great tutorial for virtualenv.
To use virtualenv with Python 3, we can do the following (reference):
pip install --upgrade virtualenv
virtualenv -p python3 envname
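Once the environment is activated (e.g. source envname/bin/activate on Linux/macOS), a quick sanity check from inside Python — a minimal sketch, assuming the environment was created as envname above:
import sys

# Inside an active virtualenv, sys.prefix points into the environment directory
# (e.g. .../envname) rather than the system Python installation.
print(sys.executable)
print(sys.prefix)
# Works for both virtualenv (sets sys.real_prefix) and the built-in venv module (sets sys.base_prefix).
in_env = hasattr(sys, 'real_prefix') or sys.prefix != getattr(sys, 'base_prefix', sys.prefix)
print(in_env)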
17 Apr 2016
R basics
# load data
data(mtcars)
# structure of data frame
str(mtcars)
# dimension of the data
dim(mtcars)
# row names
row.names(mtcars)
# access individual variable
mtcars$mpg
# get names
names(mtcars)
# get current directory
getwd()
# set new directory (pass the path as a string)
setwd('path/to/directory')
# Setting levels of ordered factors solution
reddit$age.range <- ordered(reddit$age.range,
levels = c('Under 18', '18-24', '25-34', '35-44', '45-54', '55-64', '65 or Above'))
# Alternate Solution
reddit$age.range <- factor(reddit$age.range,
levels = c('Under 18', '18-24', '25-34', '35-44', '45-54', '55-64', '65 or Above'), ordered = T)
Explore one variable
pf <- read.csv('pseudo_facebook.tsv', sep = '\t')
names(pf)
# ggplot
ggplot(aes(x = dob_day), data = pf) +
geom_histogram(binwidth = 1) +
scale_x_continuous(breaks = 1:31)
# use facet_wrap
ggplot(data = pf, aes(x = dob_day)) +
geom_histogram(binwidth = 1) +
scale_x_continuous(breaks = 1:31) +
facet_wrap(~dob_month)
ggplot(aes(x = friend_count), data = subset(pf, !is.na(gender))) +
geom_histogram(binwidth = 25) +
scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
facet_wrap(~gender)
# make a table
table(pf$gender)
# use by to summary
by(pf$friend_count, pf$gender, summary)
ggplot(aes(x = tenure / 365), data = pf) +
geom_histogram(color = 'black', fill = '#F79420') +
scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7)) +
xlab('Number of years using Facebook') +
ylab('Number of users in sample')
# Transforming data solution
library(gridExtra)
p1 <- qplot(x = friend_count, data = pf)
p2 <- qplot(x = log10(friend_count), data = pf)
p3 <- qplot(x = sqrt(friend_count), data = pf)
grid.arrange(p1, p2, p3, ncol = 1)
# alternative method
p1 <- ggplot(aes(x = friend_count), data = pf) +
geom_histogram()
p2 <- p1 + scale_x_log10()
p3 <- p1 + scale_x_sqrt()
grid.arrange(p1, p2, p3, ncol = 1)
# Frequency Polygons
ggplot(aes(x = friend_count, y = ..count../sum(..count..)), data = subset(pf, !is.na(gender))) +
geom_freqpoly(aes(color = gender), binwidth=10) +
scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
xlab('Friend Count') +
ylab('Percentage of users with that friend count')
# Box plot
# box plot
qplot(x = gender, y = friend_count, data = subset(pf, !is.na(gender)),
geom = 'boxplot') +
coord_cartesian(ylim = c(0, 1000))
# scatter plots
qplot(x = age, y = friend_count, data = pf)
# alternative scatter plots
ggplot(aes(x = age, y = friend_count), data = pf) +
geom_point() +
xlim(13, 90)
# alpha is transparency
# jitter to avoid overplotting
ggplot(aes(x = age, y = friend_count), data = pf) +
geom_jitter(alpha = 1/20, position = position_jitter(h = 0)) +
xlim(13, 90) +
coord_trans(y = 'sqrt')
facet_wrap(formula)
facet_wrap(~variable)
facet_grid(formula)
facet_grid(vertical~horizontal)
Explore two variables
# two variables
library(dplyr)
age_groups <- group_by(pf, age)
pf.fc_by_age <- summarise(age_groups,
friend_count_mean = mean(friend_count),
friend_count_median = median(friend_count),
n = n())
pf.fc_by_age <- arrange(pf.fc_by_age, age)
head(pf.fc_by_age)
# alternative way
pf.fc_by_age <- pf %>%
group_by(age) %>%
summarise(friend_count_mean = mean(friend_count),
friend_count_median = median(friend_count),
n = n()) %>%
arrange(age)
# overlaying summaries with raw data
# (only one coordinate system applies per plot, so coord_cartesian is used here rather than coord_trans)
ggplot(aes(x = age, y = friend_count), data = pf) +
coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000)) +
geom_point(alpha = 0.05,
position = position_jitter(h = 0),
color = 'orange') +
geom_line(stat = 'summary', fun.y = mean) +
geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = 0.1),
linetype = 2, color = 'blue') +
geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = 0.5),
linetype = 2, color = 'blue') +
geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = 0.9),
linetype = 2, color = 'blue')
# correlation
cor.test(pf$age, pf$friend_count, method = 'pearson')
# alternative way
with(pf, cor.test(age, friend_count, method = 'pearson'))
# subset
with(subset(pf, age <= 70), cor.test(age, friend_count, method = 'pearson'))
# create scatterplots, strong correlations
ggplot(aes(x = www_likes_received, y = likes_received), data = pf) +
geom_point() +
xlim(0, quantile(pf$www_likes_received, 0.95)) +
ylim(0, quantile(pf$likes_received, 0.95)) +
geom_smooth(method = 'lm', color = 'red')
# age with months means
pf.fc_by_age_months <- pf %>%
group_by(age_with_months) %>%
summarise(friend_count_mean = mean(friend_count),
friend_count_median = median(friend_count),
n = n()) %>%
arrange(age_with_months)
# plot together
p1 <- ggplot(aes(x = age, y = friend_count_mean),
data = subset(pf.fc_by_age, age < 71)) +
geom_line() +
geom_smooth()
p2 <- ggplot(aes(x = age_with_months, y = friend_count_mean),
data = subset(pf.fc_by_age_months, age_with_months < 71)) +
geom_line() +
geom_smooth()
p3 <- ggplot(aes(x = round(age / 5) * 5, y = friend_count),
data = subset(pf, age < 71)) +
geom_line(stat = 'summary', fun.y = mean)
grid.arrange(p2, p1, p3, ncol = 1)
# third qualitative variable
ggplot(aes(x = gender, y = age), data = subset(pf, !is.na(gender))) +
geom_boxplot() +
stat_summary(fun.y = mean, geom = 'point', shape = 4)
ggplot(aes(x = age, y = friend_count), data = subset(pf, !is.na(gender))) +
geom_line(aes(color = gender), stat = 'summary', fun.y = median)
pf.fc_by_age_gender <- pf %>%
filter(!is.na(gender)) %>%
group_by(age, gender) %>%
summarise(mean_friend_count = mean(friend_count),
median_friend_count = median(friend_count),
n = n()) %>%
ungroup() %>%
arrange(age)
# cut a variable
pf$year_joined <- floor(2014 - pf$tenure/365)
pf$year_joined.bucket <- cut(pf$year_joined,
c(2004, 2009, 2011, 2012, 2014))
# Friend rate
with(subset(pf, tenure >= 1), summary(friend_count / tenure))
Explore many variables
# load data and see its structure
yo <- read.csv('yogurt.csv')
str(yo)
# change the id from an int to a factor
yo$id <- factor(yo$id)
str(yo)
# histogram
qplot(data = yo, x = price, fill = I('#F79420'))
# number of purchases
summary(yo)
length(unique(yo$price))
table(yo$price)
# add new variable all.purchases
yo <- transform(yo, all.purchases = strawberry + blueberry + pina.colada + plain + mixed.berry)
# scatter plot
ggplot(aes(x = time, y = price), data = yo) +
geom_jitter(alpha = 1/4, shape = 21, fill = I('#F79420'))
# look at samples of households
set.seed(4230)
sample.ids <- sample(levels(yo$id), 16)
ggplot(aes(x = time, y = price),
data = subset(yo, id %in% sample.ids)) +
facet_wrap( ~ id) +
geom_line() +
geom_point(aes(size = all.purchases), pch = 1)
# scatterplot matrices
library(GGally)
theme_set(theme_minimal(20))
set.seed(1836)
pf_subset <- pf[, c(2:15)]
names(pf_subset)
ggpairs(pf_subset[sample.int(nrow(pf_subset), 1000), ])
######################
# even more variables
nci <- read.table('nci.tsv')
# change the colnames to produce a nicer plot
colnames(nci) <- c(1:64)
# create a heat map
# melt data to long format (melt() is from the reshape2 package)
library(reshape2)
nci.long.samp <- melt(as.matrix(nci[1:200, ]))
names(nci.long.samp) <- c('gene', 'case', 'value')
head(nci.long.samp)
# make the heat map
ggplot(aes(y = gene, x = case, fill = value),
data = nci.long.samp) +
geom_tile() +
scale_fill_gradientn(colors = colorRampPalette(c('blue', 'red'))(100))
14 Apr 2016
How to represent meaning in a computer?
- Common answer: use a taxonomy like WordNet that has hypernym (is-a) relationships and synonym sets
- missing nuances, new words
- subjective
- requires human labor to create and adapt
- hard to compute accurate word similarity
- “one-hot” representation
- Distributional similarity based representations
- word-document cooccurrence matrix will give general topics
- window around each word captures both syntactic and semantic information
Problems with simple cooccurrence vectors
- Increase in size with vocabulary
- Very high dimensional: requires a lot of storage
- Subsequent classification models have sparsity issues
- Models are less robust
Solution: store most of the important information in a fixed, small number of dimensions with the help of dimensionality reduction methods (e.g. apply PCA to cooccurrence matrix)
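For instance, a minimal numpy sketch of this idea: build a tiny window-1 co-occurrence matrix (a toy example) and reduce it with a truncated SVD, which is closely related to PCA:
import numpy as np

# Toy word-word co-occurrence counts (rows/cols indexed by the vocabulary below).
vocab = ['i', 'like', 'enjoy', 'deep', 'learning', 'nlp', 'flying', '.']
X = np.array([
    [0, 2, 1, 0, 0, 0, 0, 0],
    [2, 0, 0, 1, 0, 1, 0, 0],
    [1, 0, 0, 0, 0, 0, 1, 0],
    [0, 1, 0, 0, 1, 0, 0, 0],
    [0, 0, 0, 1, 0, 0, 0, 1],
    [0, 1, 0, 0, 0, 0, 0, 1],
    [0, 0, 1, 0, 0, 0, 0, 1],
    [0, 0, 0, 0, 1, 1, 1, 0],
], dtype=float)

# Full SVD, then keep only the top-k singular directions as low-dimensional word vectors.
U, s, Vt = np.linalg.svd(X, full_matrices=False)
k = 2
word_vectors = U[:, :k] * s[:k]          # each row is a k-dimensional word embedding
print(dict(zip(vocab, word_vectors.round(2))))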
Problem with PCA:
- Computational cost scales quadratically (bad for millions of words and documents)
- Hard to incorporate new words or documents
- Different learning regime than other DL models
Idea: directly learn low-dimensional word vectors
- Learning representations by back-propagating errors
- Neural probabilistic language models
- Word2Vec
Main idea of Word2Vec:
- Instead of capturing cooccurrence counts directly
- Predict the surrounding words of every word
- Faster and can easily incorporate new sentences and documents
Details of Word2Vec:
- Predict surrounding words in a window of length m of every word
- Objective function: maximize the log probability of any context word given the current center word
- $J(\theta) = \frac{1}{T} \sum_{t=1}^T \sum_{-m \le j \le m, j \ne 0} \log p(w_{t+j} \vert w_t)$, where $\theta$ represents all the variables we optimize
- $p(o \vert c) = \frac{\exp(u_o^T v_c)}{\sum_{w=1}^W \exp(u_w^T v_c)}$, where $o$ is the outside word id, $c$ is the center word id, $u_o$ is the “outside” vector of $o$, and $v_c$ is the “center” vector of $c$
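A minimal numpy sketch of the softmax probability above (vocabulary size, embedding dimension, and the random vectors are arbitrary placeholders):
import numpy as np

np.random.seed(0)
W, d = 10, 4                      # vocabulary size, embedding dimension (arbitrary)
U = np.random.randn(W, d) * 0.1   # "outside" vectors u_w
V = np.random.randn(W, d) * 0.1   # "center" vectors v_w

def p_outside_given_center(o, c):
    """Softmax probability p(o | c) = exp(u_o^T v_c) / sum_w exp(u_w^T v_c)."""
    scores = U @ V[c]             # u_w^T v_c for every word w in the vocabulary
    scores -= scores.max()        # subtract the max for numerical stability
    exp_scores = np.exp(scores)
    return exp_scores[o] / exp_scores.sum()

print(p_outside_given_center(o=3, c=7))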
Count based vs direct prediction
- Count based: LSA, PCA
- fast training
- efficient usage of statistics
- primarily used to capture word similarity
- disproportionate importance given to large counts
- Direct prediction: RNN
- scales with corpus size
- inefficient usage of statistics
- generate improved performance on other tasks
- capture complex patterns beyond word similarity
GloVe: combine them together
- Fast training
- Scalable to huge corpora
- Good performance even with small corpus and vectors
- $J(\theta) = \frac{1}{2} \sum_{i,j=1}^W f(P_{ij})(u_i^T v_j - \log P_{ij})^2$
Skip-gram model: train binary logistic regressions for a true pair (center word and a word in its context window) and a couple of random pairs (the center word with a random word)
Continuous bag of words (CBOW) model: predict the center word from the sum of surrounding word vectors, instead of predicting the surrounding words from the center word as in the skip-gram model
- If you only have a small training dataset, do not train the word vectors.
- If you have a very large dataset, it may work better to train the word vectors.
General strategy for successful Neural Nets
- Select a network structure appropriate for the problem: sigmoid, tanh, ReLU
- Check for implementation bugs with gradient checks
- Parameter initialization
- Initialize hidden layer biases to 0 and output biases to optimal value if weights were 0
- Initialize weights $\sim \mathrm{Uniform}(-r, r)$, with $r$ inversely proportional to fan-in (previous layer size) and fan-out (next layer size), e.g. $r = \sqrt{6/(\mathrm{fan\_in} + \mathrm{fan\_out})}$
- Optimization:
- SGD usually wins over all batch methods on large datasets
- L-BFGS or Conjugate Gradients win on smaller datasets
- Learning rates
- Simplest recipe: keep it fixed and use the same for all parameters
- Collobert scales them by the inverse of the square root of the fan-in of each neuron
- Better results can generally be obtained by allowing learning rates to decrease in O(1/t)
- Better yet: no hand-set learning rates by using L-BFGS or AdaGrad
- Prevent overfitting
- simple first step: reduce model size by lowering number of units and layers and other parameters
- standard L1 or L2 regularization on weights
- early stopping: use parameter that gave best validation error
- sparsity constraints on hidden activations
- dropout
Adagrad:
- Standard SGD with a fixed learning rate $\alpha$: $\theta_{new} = \theta_{old} - \alpha \nabla_{\theta} J_t(\theta)$
- Instead: adaptive learning rates; the learning rate adapts separately for each parameter, and rarely updated parameters get larger updates than frequently updated ones.
- $g_{t,i} = \frac{\partial}{\partial \theta_{t,i}}J_t(\theta)$
- $\theta_{t,i} = \theta_{t-1, i} - \frac{\alpha}{\sqrt{\sum_{\tau=1}^t g_{\tau,i}^2}} g_{t,i}$
Deep learning tricks of the trade (Y. Bengio, 2012):
- unsupervised pre-training
- SGD and setting learning rates
- main hyper-parameters
- learning rate schedule and early stopping
- minibatches
- parameter initialization
- number of hidden units
- L1 or L2 weight decay
- sparsity regularization
- how to efficiently search for hyper-parameter configuration:
- random hyperparameter search
A language model computes a probability for a sequence of words. Probability is usually conditioned on window of n previous words.
RNN: condition the neural network on all previous words and tie the weights at each time step
RNN language model: use the same set of weights at all time steps.
- $h_t = \sigma (W^{(hh)}h_{t-1} + W^{(hx)}x_{t})$
- $\hat{y}_t = softmax(W^{(S)}h_t)$
- $\hat{P}(x_{t+1}=v_j \vert x_t, \ldots, x_1) = \hat{y}_{t,j}$
- $J^{(t)}(\theta) = - \sum_{j=1}^{\vert V \vert} y_{t,j} \log \hat{y}_{t,j}$
- $J = \frac{1}{T} \sum_{t=1}^T J^{(t)}$
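A numpy sketch of one forward step of such an RNN language model (sizes, initialization, and the placeholder input are arbitrary):
import numpy as np

np.random.seed(0)
H, D, V = 50, 30, 1000                    # hidden size, input (embedding) size, vocab size
W_hh = np.random.randn(H, H) * 0.01
W_hx = np.random.randn(H, D) * 0.01
W_s  = np.random.randn(V, H) * 0.01

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def rnn_step(h_prev, x_t):
    """h_t = sigma(W_hh h_{t-1} + W_hx x_t);  y_hat_t = softmax(W_s h_t)."""
    h_t = sigmoid(W_hh @ h_prev + W_hx @ x_t)
    scores = W_s @ h_t
    scores -= scores.max()                # numerical stability
    y_hat = np.exp(scores) / np.exp(scores).sum()
    return h_t, y_hat

h0 = np.zeros(H)
x1 = np.random.randn(D)                   # embedding of the first word (placeholder)
h1, y_hat1 = rnn_step(h0, x1)
# Cross-entropy loss J_t = -log(y_hat1[j]) for the true next word j:
print(-np.log(y_hat1[123]))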
Vanishing or exploding gradient problem for RNNs: the gradient is a product of Jacobian matrices, each associated with a step in the forward computation. This product can become very small or very large quickly, and the locality assumption of gradient descent breaks down.
The solution, first introduced by Mikolov, is to clip gradients to a maximum norm. Pseudocode for norm clipping:
- $g \leftarrow \frac{\partial \epsilon}{\partial \theta}$
- if $\Vert g \Vert \ge$ threshold then
- $g \leftarrow \frac{\mathrm{threshold}}{\Vert g \Vert} g$
- end if
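A direct numpy translation of this norm-clipping pseudocode (the threshold value is an arbitrary choice):
import numpy as np

def clip_gradient(g, threshold=5.0):
    """Rescale g so that its L2 norm never exceeds threshold."""
    norm = np.linalg.norm(g)
    if norm >= threshold:
        g = (threshold / norm) * g
    return g

g = np.random.randn(1000) * 10
print(np.linalg.norm(clip_gradient(g)))   # <= 5.0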
For vanishing gradients: initialization + ReLU
- Initialize W to the identity matrix and use $f(z) = rect(z) = max(z, 0)$
Bidirectional RNN: for classification we want to incorporate information from words both preceding and following
Semantic Vector Spaces: vectors representing phrases and sentences that do not ignore word order and capture semantics for NLP tasks.
Model comparison:
- Bag of Vectors: surprisingly good baseline for simple classification problems. Especially if followed by a few layers.
- Window Model: good for single word classification for problems that do not need wide context
- CNNs: good for classification, unclear how to incorporate phrase level annotation (can only take a single label), need zero padding for shorter phrases, hard to interpret, easy to parallelize on GPUs
- Recursive Neural Networks: most linguistically plausible, interpretable, provide most important phrases, need parse trees
- Recurrent Neural Networks: most cognitively plausible (reading from left to right), not usually the highest classification performance but lots of improvements right now with gates.
13 Apr 2016
Loss function
$L = \frac{1}{N} \sum_{i=1}^N \sum_{j \neq y_i} \max(0, f_j(x_i, W) - f_{y_i}(x_i, W) + 1) + \lambda R(W)$
In common use:
- L2 regularization $R(W) = \sum_k \sum_l W_{k,l}^2$
- L1 regularization $R(W) = \sum_k \sum_l \vert W_{k,l} \vert$
- Elastic net (L1+L2) $R(W) = \sum_k \sum_l \beta_1 W_{k,l}^2 + \beta_2 \vert W_{k,l} \vert$
- Dropout: randomly set some neurons to zeros in the forward pass; force the network to have a redundant representation; need to scale at test time.
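The item above describes vanilla dropout (scale at test time); the commonly used inverted variant sketched below scales at training time instead, so test time is a no-op. A minimal numpy sketch (the keep-probability is an arbitrary choice):
import numpy as np

def dropout_forward(h, p_keep=0.5, train=True):
    """Randomly zero activations during training; inverted scaling keeps expectations equal."""
    if not train:
        return h                          # test time: identity with inverted dropout
    mask = (np.random.rand(*h.shape) < p_keep) / p_keep
    return h * mask

h = np.random.randn(4, 10)
out = dropout_forward(h)
print(out)   # roughly half the entries are zero, the rest are scaled by 1/p_keep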
Loss function type:
- Softmax $L_i = -\log\left(\frac{e^{s_{y_i}}}{\sum_j e^{s_j}}\right)$
- SVM (hinge loss) $L_i = \sum_{j \neq y_i} \max(0, s_j - s_{y_i} + 1)$
In practice: always use the analytic gradient, but check the implementation against the numerical gradient. This is called a gradient check.
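A minimal numpy sketch of such a gradient check, comparing a centered-difference numerical gradient with the analytic gradient of a toy loss (the quadratic here is just a stand-in for a real loss):
import numpy as np

def f(w):                       # toy "loss": f(w) = sum(w^2), analytic gradient 2w
    return np.sum(w ** 2)

def grad_analytic(w):
    return 2 * w

def grad_numerical(f, w, h=1e-5):
    g = np.zeros_like(w)
    for i in range(w.size):
        w_plus, w_minus = w.copy(), w.copy()
        w_plus.flat[i] += h
        w_minus.flat[i] -= h
        g.flat[i] = (f(w_plus) - f(w_minus)) / (2 * h)   # centered difference
    return g

w = np.random.randn(5)
num, ana = grad_numerical(f, w), grad_analytic(w)
rel_error = np.abs(num - ana) / np.maximum(1e-8, np.abs(num) + np.abs(ana))
print(rel_error.max())          # should be tiny (around 1e-8 or smaller)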
Activation functions
- Sigmoid $\sigma (x) = \frac{1}{(1+e^{-x})}$
- saturated neurons “kill” the gradients
- Sigmoid outputs are not zero-centered
- exp() is a bit compute expensive
- tanh(x)
- squashes numbers to range [-1, 1]
- zero centered
- still kills gradients when saturated
- ReLU (Rectified Linear Unit) $f(x) = max(0, x)$
- does not saturate
- very computationally efficient
- converges much faster than sigmoid/tanh in practice
- not zero-centered output
- Leaky ReLU $f(x) = max(\alpha x, x)$
- does not saturate
- computationally efficient
- converges much faster than sigmoid/tanh in practice
- will not “die”
- Exponential Linear Units (ELU) $f(x) = x $ if $x > 0$; $f(x) = \alpha (exp(x) -1)$ if $x \le 0$
- all benefits of ReLU
- does not die
- closer to zero mean outputs
- computation requires exp()
- Maxout “neuron” $max(w_1^T x + b_1, w_2^T x + b_2)$
- generalizes ReLU and Leaky ReLU
- linear regime, does not saturate, does not die
- doubles the number of parameters
In practice
- Use ReLU. Be careful with learning rates
- Try out Leaky ReLU/ Maxout/ ELU
- Try out tanh but do not expect much
- Do not use sigmoid
Weight initialization
- Small random numbers: works okay for small networks, but can lead to non-homogeneous distributions of activations across the layers of a network
- Xavier initialization
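A numpy sketch of the two initializations mentioned above (layer sizes are arbitrary; the uniform variant follows the $\sqrt{6/(\mathrm{fan\_in}+\mathrm{fan\_out})}$ rule from the earlier notes):
import numpy as np

fan_in, fan_out = 512, 256

# Small random numbers: fine for shallow nets, but activations can collapse in deep ones.
W_small = 0.01 * np.random.randn(fan_in, fan_out)

# Xavier/Glorot initialization: variance scaled by the layer size.
W_xavier = np.random.randn(fan_in, fan_out) / np.sqrt(fan_in)

# Uniform variant with r = sqrt(6 / (fan_in + fan_out)).
r = np.sqrt(6.0 / (fan_in + fan_out))
W_xavier_uniform = np.random.uniform(-r, r, size=(fan_in, fan_out))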
Batch normalization:
- Improves gradient flow through the network
- Allow higher learning rates
- Reduces the strong dependence on initialization
- Acts as a form of regularization
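A simplified numpy sketch of the batch-normalization forward pass at training time (learnable scale gamma and shift beta, epsilon for numerical stability; the running statistics used at test time are omitted):
import numpy as np

def batchnorm_forward(x, gamma, beta, eps=1e-5):
    """Normalize each feature over the mini-batch, then scale and shift."""
    mu = x.mean(axis=0)
    var = x.var(axis=0)
    x_hat = (x - mu) / np.sqrt(var + eps)
    return gamma * x_hat + beta

x = np.random.randn(64, 100) * 3 + 5          # mini-batch of 64 examples, 100 features
out = batchnorm_forward(x, gamma=np.ones(100), beta=np.zeros(100))
print(out.mean(axis=0)[:3], out.std(axis=0)[:3])  # ~0 and ~1 per feature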
Hyperparameters to play with:
- network architecture
- learning rate, its decay schedule, update type
- regularization
Step update
- Gradient descent
x += -learning_rate * dx
- Momentum update
v = mu * v - learning_rate * dx # integrate velocity
x += v # integrate position
- physical interpretation as ball rolling down the loss function + friction (mu coefficient)
- mu = usually ~0.5, 0.9, or 0.99 (or annealed over time)
- Nesterov momentum update
- $v_t = \mu v_{t-1} - \epsilon \nabla f(\theta_{t-1} + \mu v_{t-1})$
- $\theta_t = \theta_{t-1} + v_t$
- use a variable transform
- $ v_t = \mu v_{t-1} - \epsilon \nabla f(\phi)$
- $\phi_t = \phi_{t-1} - \mu v_{t-1} + (1+\mu) v_t$
- code
v_prev = v # back this up
v = mu * v - learning_rate * dx
x += -mu * v_prev + (1 + mu) * v
- AdaGrad update
cache += dx**2
x += - learning_rate * dx / (np.sqrt(cache) + 1e-7)
- added element-wise scaling of the gradient based on the historical sum of squares in each dimension
- RMSProp update
cache = decay_rate * cache + (1 - decay_rate) * dx**2
x += - learning_rate * dx / (np.sqrt(cache) + 1e-7)
- Adam update
m = beta1 * m + (1-beta1) * dx # update first moment
v = beta2 * v + (1-beta2) * (dx**2) # update second moment
mb = m/(1-beta1**t) # correct bias
vb = v/(1-beta2**t) # correct bias
x += -learning_rate * mb / (np.sqrt(vb) + 1e-7)
Learning rate decay over time:
- step decay: e.g. decay learning rate by half every few epochs
- exponential decay: $\alpha = \alpha_0 e^{-kt}$
- 1/t decay: $\alpha = \alpha_0 / (1+kt)$
Second order optimization methods:
- second order Taylor: $J(\theta) = J(\theta_0) + (\theta-\theta_0)^T \nabla _{\theta}J(\theta_0) + \frac{1}{2} (\theta - \theta_0) ^T H (\theta - \theta_0)$
- solve for critical point: $\hat{\theta} = \theta_0 - H^{-1} \nabla_{\theta} J(\theta_0)$
- Quasi-Newton methods (e.g. BFGS): instead of inverting the Hessian, approximate the inverse Hessian with rank-1 updates over time
- L-BFGS (Limited memory BFGS): does not form/store the full inverse Hessian
- usually works very well in full batch, deterministic code
- does not transfer very well to mini-batch setting
In practice:
- Adam is a good default choice in most cases
- Try L-BFGS if can afford to do full batch updates
Convolutional Neural Network (CNN)
ConvNet is a sequence of Convolution Layers, interspersed with activation functions.
$f(x,y)*g(x,y) = \sum_{n_1 = -\infty}^{\infty} \sum_{n_2 = -\infty}^{\infty} f(n_1, n_2) g(x-n_1, y-n_2)$
The Conv Layer:
- Accepts a volume of size $W_1 \times H_1 \times D_1$
- Requires four hyperparameters:
- Number of filters $K$,
- their spatial extent $F$,
- the stride $S$,
- the amount of zero padding $P$.
- Produces a volume of size $W_2 \times H_2 \times D_2$ where:
- $W_2 = (W_1 - F + 2P)/S + 1$
- $H_2 = (H_1 - F + 2P)/S + 1$ (i.e. width and height are computed equally by symmetry)
- $D_2 = K$
- With parameter sharing, it introduces $F \cdot F \cdot D_1$ weights per filter, for a total of $(F \cdot F \cdot D_1) \cdot K$ weights and $K$ biases.
- In the output volume, the $d$-th depth slice (of size $W_2 \times H_2$) is the result of performing a valid convolution of the $d$-th filter over the input volume with a stride of $S$, and then offset by $d$-th bias.
- Common settings: $F=3$, $S=1$, $P=1$.
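- Worked example: for a $32 \times 32 \times 3$ input with $K = 10$, $F = 3$, $S = 1$, $P = 1$: $W_2 = (32 - 3 + 2 \cdot 1)/1 + 1 = 32$, so the output volume is $32 \times 32 \times 10$, and the layer has $(3 \cdot 3 \cdot 3) \cdot 10 + 10 = 280$ parameters.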
Pooling layer:
- make the representations smaller and more manageable
- operates over each activation map independently
Generally, the pooling layer:
- Accepts a volume of size $W_1 \times H_1 \times D_1$
- Requires three hyperparameters:
- their spatial extent $F$,
- the stride $S$,
- Produces a volume of size $W_2 \times H_2 \times D_2$ where:
- $W_2 = (W_1 - F)/S + 1$
- $H_2 = (H_1 - F)/S + 1$
- $D_2 = D_1$
- Introduces zero parameters since it computes a fixed function of the input
- Note that it is not common to use zero-padding for Pooling layers
- Common setting: $F = 2, S=2$; $F=3, S=2$.
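- Worked example: with $F = 2$, $S = 2$, a $224 \times 224 \times 64$ input is downsampled to $(224 - 2)/2 + 1 = 112$ on each side, i.e. $112 \times 112 \times 64$, discarding 75% of the activations while leaving the depth unchanged.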
How to stack convolutions:
- Replace large convolutions (5x5, 7x7) with stacks of 3x3 convolutions
- 1x1 “bottleneck” convolutions are very efficient
- Can factor NxN convolutions into 1xN and Nx1
- All of the above give fewer parameters, less compute, and more nonlinearity
Convolution Theorem: the convolution of f and g is equal to the elementwise product of their Fourier Transforms: $F(f * g) = F(f)F(g)$. Using the FFT, we can compute the DFT of an N-dimensional vector in $O(N \log N)$ time.
Implementing convolutions with the FFT:
- Compute FFT of weights: F(W)
- Compute FFT of image: F(X)
- Compute elementwise product: F(W)F(X)
- Compute inverse FFT: $Y=F^{-1}(F(W)F(X))$
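A numpy sketch of these four steps for a single-channel image and filter, checked against a direct 2-D convolution from scipy (sizes here are arbitrary):
import numpy as np
from scipy.signal import convolve2d

X = np.random.randn(32, 32)               # image
W = np.random.randn(5, 5)                 # filter weights

# Output size of the full linear convolution; zero-pad both FFTs to this shape.
out_shape = (X.shape[0] + W.shape[0] - 1, X.shape[1] + W.shape[1] - 1)

FX = np.fft.fft2(X, out_shape)            # FFT of image
FW = np.fft.fft2(W, out_shape)            # FFT of weights
Y = np.fft.ifft2(FX * FW).real            # elementwise product, then inverse FFT

print(np.allclose(Y, convolve2d(X, W, mode='full')))   # True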
Segmentation:
- Semantic segmentation
- classify all pixels
- fully convolutional models, downsample then upsample
- learnable upsampling: fractionally strided convolution
- skip connections can help
- Instance segmentation
- detect instance, generate mask
- similar pipelines to object detection
Attention:
- Soft attention:
- easy to implement: produce distribution over input locations, reweight features and feed as input
- attend to arbitrary input locations using spatial transformer networks
- Hard attention:
- attend to a single input location
- cannot use gradient descent
- need reinforcement learning
Unsupervised learning:
- Autoencoders
- Traditional: feature learning, reconstruct input, not used much anymore
- Variational: generate samples, Bayesian meets deep learning
- Generative adversarial networks: generate samples
Recurrent Neural Networks (RNN)
$ h_t = f_w (h_{t-1}, x_t)$, where $h_t$ is new state, $h_{t-1}$ is old state, $f_w$ is some function with parameters $w$, $x_t$ is input vector at some time step.
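For example, the simplest vanilla RNN uses a single tanh layer: $h_t = \tanh(W_{hh} h_{t-1} + W_{xh} x_t)$ with an output $y_t = W_{hy} h_t$ (the weight names here follow common convention rather than this note's generic $f_w$).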
10 Apr 2016
1. Docker Basics
Docker is an open platform for developing, packaging, and running portable distributed applications.
What makes Docker different from virtual machines is that while each virtual machine runs a separate guest OS, Docker containers share the host's OS kernel.
Install Docker with the official Get Started with Docker guide.
Docker cheat sheet
An instance of an image is called a container.
List all images: $ docker images
List all containers: $ docker ps -a
Download a pre-built image (ubuntu): $ docker pull ubuntu
Run a container from a specific image:
$ sudo docker run -i -t <image_id || repository:tag> /bin/bash
Start an existing container:
$ sudo docker start -i <image_id>
Attach to a running container:
$ sudo docker attach <container_id>
Exit without shutting down a container:
[Ctrl-p] + [Ctrl-q]
Start a new container:
$ JOB=$(docker run -d ubuntu /bin/sh -c "while true; do echo Hello world; sleep 1; done")
Stop the container: $ docker stop $JOB
Start the container: $ docker start $JOB
Restart the container: $ docker restart $JOB
Kill a container: $ docker kill $JOB
Remove a container:
$ docker stop $JOB # Container must be stopped to remove it
$ docker rm $JOB
or $ docker rm <container_id>
Remove an image:
$ docker rmi <image_id> # container must be removed first
Search for images: $ docker search <image_name>
Pull image: $ docker pull <image_name>
Commit your container to a new named image:
# run a container
$ docker run -it <image_name> /bin/bash
# make some changes in the container, then exit.
# commit a copy of this container to an image.
$ docker commit -m "changes content" <container_id> <new_image_name>
# -m is for the commit message
Build an image:
# create a Dockerfile like the following:
FROM ubuntu:14.04
MAINTAINER Kate Smith <ksmith@example.com>
RUN apt-get update && apt-get install -y ruby ruby-dev
RUN gem install sinatra
$ docker build -t <user>/<image_name>:<tag> .
Docker command flags:
-d: run in background
-P: map any required network ports inside the container to the host
-i: interactive
2. Docker for Hadoop
The image I used to launch Hadoop is Hadoop-docker. Following the instructions in the README file, you can launch and test the Hadoop platform pretty easily.
To use the image: $ docker run -it sequenceiq/hadoop-docker:2.7.1 /etc/bootstrap.sh -bash
3. Docker for Spark
Docker-spark
To use the image: $ docker run -d -h sandbox sequenceiq/spark:1.6.0 -d