2015年5月21日 星期四

SNP calling using samtools

edit bash file
$open .bash_profile

copy file to server share folder
$rsync -avz "filename" chinstrap:/store/share/Andes

1. download sequence data (in .sra form)
2. convert .sra to .fastq using sratoool kit
$ fastq-dump --split-files SRR11511287

$ /Users/andesli/Desktop/sratoolkit.2.5.0-1-mac64/bin/fastq-dump SRR1151287 (download)
/Users/andesli/Desktop/sratoolkit.2.5.0-1-mac64/bin/fastq-dump -A /Users/andesli/Desktop/bifido_seq_data/test/hi_seq_breve_jcm1192/SRR1151287.sra (convert)

flow diagram:

step1 : Align
(follow link http://ged.msu.edu/angus/tutorials-2011/bwa_tutorial.html)

2014年10月22日 星期三

R script (ggplot2)

#data format

df<-data.frame(trt=factor(c(1,1,2,2)),resp=c(1,5,3,4),group=factor(c(1,2,1,2)),se=c(0.1,0.3,0.3,0.2))

#example
    trt resp group  se
1   1    1      1      0.1
2   1    5      2      0.3
3   2    3      1      0.3
4   2    4      2      0.2

#choose specific row from data

df2<-df[c(1,3),]

#example
    trt resp group  se
1   1    1      1      0.1
3   2    3      1      0.3

#define function summarySE (sd,se,ci)
## Summarizes data.
## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%).
##   data: a data frame.
##   measurevar: the name of a column that contains the variable to be summariezed
##   groupvars: a vector containing names of columns that contain grouping variables
##   na.rm: a boolean that indicates whether to ignore NA's
##   conf.interval: the percent range of the confidence interval (default is 95%)
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
                      conf.interval=.95, .drop=TRUE) {
    require(plyr)

    # New version of length which can handle NA's: if na.rm==T, don't count them
    length2 <- function (x, na.rm=FALSE) {
        if (na.rm) sum(!is.na(x))
        else       length(x)
    }

    # This does the summary. For each group's data frame, return a vector with
    # N, mean, and sd
    datac <- ddply(data, groupvars, .drop=.drop,
      .fun = function(xx, col) {
        c(N    = length2(xx[[col]], na.rm=na.rm),
          mean = mean   (xx[[col]], na.rm=na.rm),
          sd   = sd     (xx[[col]], na.rm=na.rm)
        )
      },
      measurevar
    )

    # Rename the "mean" column    
    datac <- rename(datac, c("mean" = measurevar))

    datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean

    # Confidence interval multiplier for standard error
    # Calculate t-statistic for confidence interval: 
    # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
    ciMult <- qt(conf.interval/2 + .5, datac$N-1)
    datac$ci <- datac$se * ciMult

    return(datac)
}

2014年9月24日 星期三

R script (grofit)

#before everything starts, type this script/

library(grofit)


#open 2 R console in terminal
open -n -a R.app

#print working directory
getwd()



Workflow

#Define option (e.g. interactive = FALSE means manually choose each growth curve

#
grofit.control(neg.nan.act = FALSE, clean.bootstrap = TRUE,
               suppress.messages = FALSE, fit.opt = "b",
               log.x.gc = FALSE, log.y.gc = FALSE,
               interactive = TRUE, nboot.gc = 0,
               smooth.gc= NULL, model.type=c("logistic",
               "richards","gompertz", "gompertz.exp"),
               have.atleast = 6, parameter = 9, smooth.dr = NULL,
               log.x.dr = FALSE, log.y.dr = FALSE, nboot.dr = 0)

MyOpt1 <- grofit.control(fit.opt="b",interactive = FALSE,log.y.gc=TRUE,smooth.gc=0.5)

#use specific model

MyOpt2 <- grofit.control(fit.opt="m",model.type=c("gompertz"),interactive = FALSE,log.y.gc=TRUE,smooth.gc=0.5)

#read data from csv file

data<-read.table ("/Users/andesli/Desktop/20140926_Mutantbglucan_merged_log.csv",header=FALSE,sep=",")

#Set time

timepoints<-1:44
OR
#Set time from 0.5 to 22, interval = 0.5
timepoint2<-seq(0.5,22, by=0.5)

#Input time into a matrix form

time<-matrix(rep(timepoints,86),nrow=86,ncol=44,byrow=TRUE)

#Fit data with grofit model and FALSE mean turn off dose response curve function/
TestRun1 <- grofit(time, data, FALSE, MyOpt1)
#plot(TestRun1) is not working
OR
TestRun1<-gcFit(time,data,control=MyOpt1)

Plot Graph

#set a new pop up window

#make an empty chart
plot(1,1,xlim=c(0,42),ylim=c(0,1),type="n",ann=FALSE)
title(main="Mutant_1",xlab="Time (hours)",ylab="log(1+OD at 600nm)")

dev.new(width=8,height=3)
par(mfrow=c(1,3),mar=c(5.1,4.1,3.1,1.1))

#define dot color

colData<-c("black","cyan","magenta","green","blue","orange","grey","red","darkgreen","pink")

#pch = dot shape, slope = TURE-> tangent, add = whether plot in the existing window
#opt=m => plot model, s => spline

plot(TestRun1, add=FALSE, opt="m", pch=1:10, cex=1,colModel=1,colData=colData, slope=TRUE)

#plot specific curve in large data

plot(TestRun1$gcFittedModel[[no. of row], add=FALSE, opt="m", pch=1, cex=1,colModel=1,colData=1, slope=TRUE)
OR
#for data cannot fit model
plot(TestRun$gcFittedSpline[[no. of row]], add=FALSE, opt="s", pch=1, cex=1,colSpline=1,colData=1, slope=FALSE,raw=TRUE)

# call out legend

legend("topleft",c(TestRun_log$gcFittedModel[[no. of row]]$gcID))
legend("topright",c("RCM","beta glucan"),pch=c(1,2),col=c(1,2))

#plot single curve with best fitted curve and tagent

plot(TestRun_log$gcFittedModel[[no. of row]]$raw.time,TestRun_log$gcFittedModel[[no. of row]]$raw.data,xlab="Time (hours)",ylab="log(1+OD (nm600))")
plot(plot(TestRun_log$gcFittedModel[[no. of row]],add=TURE)

#plot multiple curve at the same time and save

for (i in 1:10)
{
pdf(paste("/Users/andesli/Desktop/Mutant_",b,".pdf")
+ plot(1,1,xlim=c(0,42),ylim=c(0,1),type="n",ann=FALSE)
+ title(main=paste("Mutant ",i),xlab="Time (hours)",ylab="log(1+OD at 600nm)")
+ legend("topright",c("RCM","beta glucan"),pch=c(1,2),col=c(1,2))
+ plot(bifido_merged$gcFittedModel[[i+111]],add=TRUE,pch=1,cex=1,colModel=1,colData=1,slope=TRUE,raw=TRUE)
+ plot(bifido_merged$gcFittedModel[[i]],add=TRUE,pch=2,cex=1,colModel=2,colData=2,slope=TRUE,raw=TRUE)
dev.off()
}



#Export data to a .txt

sink(file = "/Users/andesli/Desktop/TestRun1.txt")
TestRun1
sink()

#Run R in terminal with script, output in same directory

R CMD batch "filename"

Reference link
#Generic plot function for gcFit objects
http://rgm3.lab.nig.ac.jp/RGM/R_rdfile?f=grofit/man/plot.gcFit.Rd&d=R_CC

#pch shape chart
http://www.harding.edu/fmccown/r/

#Scatterplots tutorial
http://www.statmethods.net/graphs/scatterplot.html