<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
  <channel>
    <title>GEN242 – Assignments</title>
    <link>/assignments/</link>
    <description>Recent content in Assignments on GEN242</description>
    <generator>Hugo -- gohugo.io</generator>
    
	  <atom:link href="/assignments/index.xml" rel="self" type="application/rss+xml" />
    
    
      
        
      
    
    
    <item>
      <title>Assignments: </title>
      <link>/assignments/projects/helper_code/aligners/star_test/</link>
      <pubDate>Mon, 01 Jan 0001 00:00:00 +0000</pubDate>
      
      <guid>/assignments/projects/helper_code/aligners/star_test/</guid>
      <description>
        
        
        &lt;p&gt;#############&lt;/p&gt;
&lt;h4 id=&#34;star&#34;&gt;STAR&lt;/h4&gt;
&lt;p&gt;#############&lt;/p&gt;
&lt;h3 id=&#34;read-mapping-with-star&#34;&gt;Read mapping with &lt;code&gt;STAR&lt;/code&gt;&lt;/h3&gt;
&lt;pre&gt;&lt;code class=&#34;language-r&#34;&gt;library(systemPipeR)
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## Loading required package: Rsamtools
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## Loading required package: GenomeInfoDb
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## Loading required package: BiocGenerics
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## 
## Attaching package: &#39;BiocGenerics&#39;
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## The following objects are masked from &#39;package:stats&#39;:
## 
##     IQR, mad, sd, var, xtabs
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## The following objects are masked from &#39;package:base&#39;:
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
##     table, tapply, union, unique, unsplit, which.max, which.min
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## Loading required package: S4Vectors
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## Loading required package: stats4
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## 
## Attaching package: &#39;S4Vectors&#39;
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## The following object is masked from &#39;package:utils&#39;:
## 
##     findMatches
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## The following objects are masked from &#39;package:base&#39;:
## 
##     expand.grid, I, unname
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## Loading required package: IRanges
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## Loading required package: GenomicRanges
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## Loading required package: Biostrings
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## Loading required package: XVector
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## 
## Attaching package: &#39;Biostrings&#39;
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## The following object is masked from &#39;package:base&#39;:
## 
##     strsplit
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## Loading required package: ShortRead
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## Loading required package: BiocParallel
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## Loading required package: GenomicAlignments
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## Loading required package: SummarizedExperiment
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## Loading required package: MatrixGenerics
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## Loading required package: matrixStats
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## 
## Attaching package: &#39;MatrixGenerics&#39;
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## The following objects are masked from &#39;package:matrixStats&#39;:
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## Loading required package: Biobase
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     &#39;browseVignettes()&#39;. To cite Bioconductor, see
##     &#39;citation(&amp;quot;Biobase&amp;quot;)&#39;, and for packages &#39;citation(&amp;quot;pkgname&amp;quot;)&#39;.
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## 
## Attaching package: &#39;Biobase&#39;
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## The following object is masked from &#39;package:MatrixGenerics&#39;:
## 
##     rowMedians
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## The following objects are masked from &#39;package:matrixStats&#39;:
## 
##     anyMissing, rowMedians
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code class=&#34;language-r&#34;&gt;# sal &amp;lt;- SPRproject(logs.dir= &amp;quot;.SPRproject_test&amp;quot;) # use this line when .SPRproject_test doesn&#39;t exist yet
sal &amp;lt;- SPRproject(overwrite = TRUE, logs.dir= &amp;quot;.SPRproject_test&amp;quot;)
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code&gt;## Recreating directory &#39;/home/tgirke/tmp/GEN242/content/en/assignments/Projects/helper_code/aligners/.SPRproject_test&#39;
## Creating file &#39;/home/tgirke/tmp/GEN242/content/en/assignments/Projects/helper_code/aligners/.SPRproject_test/SYSargsList.yml&#39;
&lt;/code&gt;&lt;/pre&gt;
&lt;h2 id=&#34;use-gtf-file-instead-of-gff&#34;&gt;Use GTF file instead of GFF&lt;/h2&gt;
&lt;p&gt;Download the GTF file for Arabidopsis from the URL given below. This is required for proper read counting with STAR.&lt;/p&gt;
&lt;pre&gt;&lt;code class=&#34;language-r&#34;&gt;download.file(&amp;quot;https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-59/gtf/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.59.gtf.gz&amp;quot;, &amp;quot;data/tair10.gtf.gz&amp;quot;)
R.utils::gunzip(&amp;quot;data/tair10.gtf.gz&amp;quot;, overwrite=TRUE)
dna &amp;lt;- readDNAStringSet(&amp;quot;./data/tair10.fasta&amp;quot;)
names(dna) &amp;lt;- c(as.character(c(1:5)), &amp;quot;Mt&amp;quot;, &amp;quot;Pt&amp;quot;) # Fixes chromosome ids: ENSEMBL uses numbers, whereas the toy data set uses Chr1, Chr2, ...
writeXStringSet(dna, &amp;quot;./data/tair10.fasta&amp;quot;)
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code class=&#34;language-r&#34;&gt;appendStep(sal) &amp;lt;- LineWise(code = {
                library(systemPipeR)
                }, step_name = &amp;quot;load_SPR&amp;quot;)
&lt;/code&gt;&lt;/pre&gt;
&lt;h2 id=&#34;read-preprocessing&#34;&gt;Read preprocessing&lt;/h2&gt;
&lt;h3 id=&#34;preprocessing-with-preprocessreads-function&#34;&gt;Preprocessing with &lt;code&gt;preprocessReads&lt;/code&gt; function&lt;/h3&gt;
&lt;pre&gt;&lt;code class=&#34;language-r&#34;&gt;appendStep(sal) &amp;lt;- SYSargsList(
    step_name = &amp;quot;preprocessing&amp;quot;,
    targets = &amp;quot;targetsPE.txt&amp;quot;, dir = TRUE,
    wf_file = &amp;quot;preprocessReads/preprocessReads-pe.cwl&amp;quot;,
    input_file = &amp;quot;preprocessReads/preprocessReads-pe.yml&amp;quot;,
    dir_path = &amp;quot;param/cwl&amp;quot;,
    inputvars = c(
        FileName1 = &amp;quot;_FASTQ_PATH1_&amp;quot;,
        FileName2 = &amp;quot;_FASTQ_PATH2_&amp;quot;,
        SampleName = &amp;quot;_SampleName_&amp;quot;
    ),
    dependency = c(&amp;quot;load_SPR&amp;quot;))
&lt;/code&gt;&lt;/pre&gt;
&lt;h2 id=&#34;alignments-with-star&#34;&gt;Alignments with &lt;code&gt;STAR&lt;/code&gt;&lt;/h2&gt;
&lt;h3 id=&#34;star-indexing&#34;&gt;&lt;code&gt;STAR&lt;/code&gt; Indexing&lt;/h3&gt;
&lt;pre&gt;&lt;code class=&#34;language-r&#34;&gt;appendStep(sal) &amp;lt;- SYSargsList(
    step_name = &amp;quot;star_index&amp;quot;, 
    dir = FALSE, 
    targets=NULL, 
    wf_file = &amp;quot;star/star-index.cwl&amp;quot;, 
    input_file=&amp;quot;star/star-index.yml&amp;quot;,
    dir_path=&amp;quot;param/cwl&amp;quot;, 
    dependency = &amp;quot;load_SPR&amp;quot;
)
&lt;/code&gt;&lt;/pre&gt;
&lt;h3 id=&#34;star-mapping&#34;&gt;&lt;code&gt;STAR&lt;/code&gt; Mapping&lt;/h3&gt;
&lt;pre&gt;&lt;code class=&#34;language-r&#34;&gt;appendStep(sal) &amp;lt;- SYSargsList(
    step_name = &amp;quot;star_mapping&amp;quot;, 
    dir = TRUE, 
    targets = &amp;quot;preprocessing&amp;quot;,
    wf_file = &amp;quot;star/star-mapping-pe.cwl&amp;quot;, 
    input_file = &amp;quot;star/star-mapping-pe.yml&amp;quot;, 
    dir_path = &amp;quot;param/cwl&amp;quot;,
    inputvars = c(preprocessReads_1 = &amp;quot;_FASTQ_PATH1_&amp;quot;, preprocessReads_2 = &amp;quot;_FASTQ_PATH2_&amp;quot;,
        SampleName = &amp;quot;_SampleName_&amp;quot;), rm_targets_col = c(&amp;quot;FileName1&amp;quot;, &amp;quot;FileName2&amp;quot;),
    dependency = c(&amp;quot;preprocessing&amp;quot;, &amp;quot;star_index&amp;quot;))
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code class=&#34;language-r&#34;&gt;## Return command-line calls for STAR
cmdlist(sal, step=&amp;quot;star_mapping&amp;quot;, targets=1)

## BAM outpaths required for read counting below
outpaths &amp;lt;- getColumn(sal, step = &amp;quot;star_mapping&amp;quot;, &amp;quot;outfiles&amp;quot;, column = &amp;quot;Aligned_toTranscriptome_out_bam&amp;quot;)
file.exists(outpaths) # Will not return TRUE until STAR has completed successfully
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code class=&#34;language-r&#34;&gt;## To run sal stepwise, make sure you have constructed your 
## sal object step-by-step starting from an empty sal
## as shown above under chunk: initialize sal for testing 
sal &amp;lt;- runWF(sal, steps=c(1)) # increment step number one by one just for checking
sal
outpaths &amp;lt;- getColumn(sal, step = &amp;quot;star_mapping&amp;quot;, &amp;quot;outfiles&amp;quot;, column = &amp;quot;Aligned_toTranscriptome_out_bam&amp;quot;)
outpaths
file.exists(outpaths) # Will not return TRUE until STAR has completed successfully
&lt;/code&gt;&lt;/pre&gt;
&lt;pre&gt;&lt;code class=&#34;language-r&#34;&gt;## The following can be used to set things up for initial testing
starPE &amp;lt;- loadWorkflow(targets = &amp;quot;targetsPE.txt&amp;quot;, wf_file = &amp;quot;star-mapping-pe.cwl&amp;quot;, 
                       input_file = &amp;quot;star-mapping-pe.yml&amp;quot;, dir_path = &amp;quot;./param/star_test&amp;quot;)
starPE &amp;lt;- renderWF(starPE, inputvars = c(FileName1 = &amp;quot;_FASTQ_PATH1_&amp;quot;, FileName2 = &amp;quot;_FASTQ_PATH2_&amp;quot;, 
                                         SampleName = &amp;quot;_SampleName_&amp;quot;))
cmdlist(starPE)
runCommandline(starPE, make_bam = FALSE)
&lt;/code&gt;&lt;/pre&gt;
&lt;h3 id=&#34;assemble-read-count-matrix&#34;&gt;Assemble read count matrix&lt;/h3&gt;
&lt;pre&gt;&lt;code class=&#34;language-r&#34;&gt;readcounts &amp;lt;- getColumn(sal, step = &amp;quot;star_mapping&amp;quot;, &amp;quot;outfiles&amp;quot;, column = &amp;quot;ReadsPerGene_out_tab&amp;quot;)
assembleCountMA &amp;lt;- function(x) {
    df &amp;lt;- read.delim(x, row.names=1)
    colnames(df) &amp;lt;- paste(rep(gsub(&amp;quot;\\..*&amp;quot;, &amp;quot;&amp;quot;, basename(x)), 3), c(&amp;quot;both&amp;quot;, &amp;quot;sense&amp;quot;, &amp;quot;antisense&amp;quot;), sep=&amp;quot;_&amp;quot;) 
    df[!rownames(df) %in% c(&amp;quot;N_multimapping&amp;quot;, &amp;quot;N_noFeature&amp;quot;, &amp;quot;N_ambiguous&amp;quot;),]
}
countList &amp;lt;- lapply(readcounts, function(z) assembleCountMA(x=z))
names(countList) &amp;lt;- NULL
## Check whether the row order is identical among all matrices
# all(sapply(names(countList), function(x) row.names(countList[[1]]) %in% row.names(countList[[x]])))
countDFstar &amp;lt;- do.call(cbind, countList)
write.table(countDFstar, &amp;quot;results/countDFstar.xls&amp;quot;, col.names = NA, quote = FALSE, sep = &amp;quot;\t&amp;quot;)
&lt;/code&gt;&lt;/pre&gt;

      </description>
    </item>
    
  </channel>
</rss>
