Knowledge base

Downloading and decompressing sequencing data from NCBI

Introduction

Several data are shared in NCBI website, so knowning the methods for downloading data is necessary, especialy for people working on data mining.

Code exmaple

Perfetch data

After we downloaded SRR_Acc_List.txt from ncbi data, we can use prefetch to download sra file which were listed in SRR_Acc_List.txt.

  1. Dowload accession list file, install sratoolkit from ncbi website
  2. Use prefetch in sratoolkit to download all sra file in
    ./prefetch --option-file  /yjdata/data/Candidemia/SRR_Acc_List.txt -O /yjdata/data/Candidemia/sra/
    # Continue download
    ./prefetch --option-file  /yjdata/data/Candidemia/SRR_Acc_List.txt -O /yjdata/data/Candidemia/sra/ -r yes
fasterq dump

In the past, we uncompressed sra to fastq by fastq-dump. But now fastq-dump is replaced by fasterq-dump, which are multi-threaded operation and more faster.

# Prepare codes
less Healthy_RunTable.txt | awk 'BEGIN{FS = ",";OFS = "\t"}{print "echo "$1";""/yjdata/hxzk/utilities/sratoolkit.3.0.0-centos_linux64/bin/fasterq-dump --split-3 -O /yjdata/data/Candidemia/fq/ /yjdata/data/Candidemia/sra/"$1"/"$1".sra"}'
STAR

STAR is used to map fastq data to reference genome and GTF

# Build STAR reference index
STAR --runThreadN 20 --runMode genomeGenerate \
 --genomeDir /home/hxzk/download/star/GRCh38_Ensembl99/ \
 --genomeFastaFiles /home/hxzk/download/star/GRCh38_Ensembl99/Homo_sapiens.GRCh38.dna.primary_assembly.fa \
 --sjdbGTFfile /home/hxzk/download/star/GRCh38_Ensembl99/Homo_sapiens.GRCh38.99.gtf \
 --sjdbOverhang 100

Write a bash script for one sample

# star.sh
hg=$1
fq1=$2
fq2=$3
outprefix=$4
mkdir $outFileNamePrefix
STAR --runThreadN 20 --genomeDir $hg \
 --runMode alignReads \
 --quantMode TranscriptomeSAM GeneCounts \
 --readFilesIn  $fq1 $fq2 \
 --outSAMtype BAM SortedByCoordinate \
 --outFileNamePrefix $outprefix"/"

Write ta bash script for all sample

# Prepared code
less sample_table/Candidemia_Healthy_RunTable.txt | awk 'BEGIN{FS = ","}{print "bash star.sh /home/hxzk/download/star/GRCh38_Ensembl99/ /yjdata/data/Candidemia/fq/"$1"_1.fastq /yjdata/data/Candidemia/fq/"$1"_2.fastq ~/project/Candidemia/result/bam/"$1"/"}'
RSEM

build rsem index

# build RSEM references
/software/RSEM/usr/local/bin/rsem-prepare-reference --gtf /home/hxzk/download/star/GRCh38_Ensembl99/Homo_sapiens.GRCh38.99.gtf \
                       -p 10 \
                       --bowtie2 --bowtie2-path /software/bowtie2-2.4.2-sra-linux-x86_64/ \
                        /home/hxzk/download/star/GRCh38_Ensembl99/Homo_sapiens.GRCh38.dna.primary_assembly.fa \
                       /home/hxzk/download/RSEM_db/ref/human_refseq

Write a bash script for one sample

# rsem.sh
bamfile=$1
fa=$2
outprefix=$3
#/software/RSEM/usr/local/bin/rsem-calculate-expression --forward-prob 0.5 --paired-end -p 8 --bam $bamfile $fa $outprefix"/"
/software/RSEM/usr/local/bin/rsem-calculate-expression --paired-end \
                               --alignments \
                               -p 20 \
                               $bamfile \
                               $fa \
                               $outprefix

Write ta bash script for all sample

# Prepared code
less ~/project/Candidemia/result/sample_table/Candidemia_Healthy_RunTable.txt | awk 'BEGIN{FS = ","}{print "bash rsem.sh /home/hxzk/project/Candidemia/result/bam/"$1"/Aligned.toTranscriptome.out.bam /home/hxzk/download/RSEM_db/ref/human_refseq /home/hxzk/project/Candidemia/result/rsem/"$1}'
DEseq2

Collate single sample data into a table

#!/usr/bin/Rscript
library(data.table)
library(DESeq2)
setwd("/home/hxzk/project/Candidemia/")
# Read id, symbol data
id_symbol <-  fread("~/project/Candidemia/data/genes.tsv", sep = " ", header = F)
id_symbol <- id_symbol[!duplicated(V1),]
# Read all rsem data from directory
dt_lst <- list()
for(f in dir("result/rsem", pattern = "*.genes.results")){
   tmp_dt <- fread(paste0("result/rsem/",f))
   Sample <- strsplit(f, split = ".", fixed  = T)[[1]][1]
   tmp_dt$Sample <- Sample
   dt_lst[[Sample]] <- tmp_dt
}
dt <- rbindlist(dt_lst)
# Transform to expression matrix
expr_dt <- dt[,.(gene_id, expected_count, Sample)]
dcast_dt <- dcast(expr_dt, gene_id ~ Sample, value.var = "expected_count")
saveRDS(dcast_dt, file = "result/analysis/expr_dt.rds")

Run DESeq2

dcast_dt <- readRDS("result/analysis/expr_dt.rds")
# Read clinical from sra
sra_dt <- fread("result/sample_table/Candidemia_Healthy_RunTable.txt", header = F, sep = ",")
expr <- data.frame(dcast_dt[, 2:ncol(dcast_dt), with = F])
rownames(expr) <- dcast_dt$gene_id
# Transform gene id to gene symbol
expr <- expr[rownames(expr) %in% id_symbol$V1,]
symbol <- id_symbol[rownames(expr), V2, on = "V1"]
# Sum expression same symbol
expr_dt2 <- cbind(expr, symbol)
dt_melt <- melt(expr_dt2, id.vars = "symbol")
dcast_dt <- dcast(dt_melt, symbol ~ variable, fun.aggregate = sum)
expr <- data.frame(dcast_dt[,2:ncol(dcast_dt)])
rownames(expr) <- dcast_dt$symbol
# Match clincal to sample. 
#Type is condition that we want to compare, while gender is variable that we want to control its affect.
clinical_dt <- sra_dt[,.(Sample = V1, Type = V23, Gender = V15)]
group <- clinical_dt[colnames(expr),Type, on = "Sample"]
gender <- clinical_dt[colnames(expr),Gender, on = "Sample"]
# Highly expressed genes
# Filtered low expression genes
filtered_id <- which(apply(expr, 1, function(x)length(x[x > 1])>=(0.1*ncol(expr))))
mt <- expr[filtered_id,]
mt <- round(mt, 0)
## Coldata to DESeq2
coldata <- data.table(group, gender)
coldata$group <- factor(group, levels = c("Healthy", "Candidemia"))
coldata$gender <- factor(gender,  levels = c("female", "male"))
# Identification of differentially expressed genes 
ddsFullCountTable <- DESeqDataSetFromMatrix(countData = mt,
        colData = coldata,  design= ~ gender + group)
dds <- DESeq(ddsFullCountTable)
res <- results(dds)
cutoff <- 0.05
res05 <- results(dds, alpha=cutoff)
write.table(res05, file = "result/pvalue_result.txt", sep = "\t", col.names = T, quote = F)

Database

Some files can be downloaded from follows:
STAR and RSEM referece genome: https://ftp.ensembl.org/pub/release-99/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
STAR and RSEM GTF:
https://ftp.ensembl.org/pub/release-99/gtf/homo_sapiens/Homo_sapiens.GRCh38.99.gtf.gz
id_symbole table:
Link:https://pan.baidu.com/s/1Kebn7A6bm5_0XUUvQJSXtA
Extraction code:az7l

References

The manual of STAR
NCBIwebsite: https://www.ncbi.nlm.nih.gov/
生信螺丝钉:https://zhuanlan.zhihu.com/p/360792123
DESeq2 manual: https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html

Leave a Reply

Your email address will not be published. Required fields are marked *