Downloading and decompressing sequencing data from NCBI
Introduction
Several data are shared in NCBI website, so knowning the methods for downloading data is necessary, especialy for people working on data mining.
Code exmaple
Perfetch data
After we downloaded SRR_Acc_List.txt from ncbi data, we can use prefetch to download sra file which were listed in SRR_Acc_List.txt.
- Dowload accession list file, install sratoolkit from ncbi website
- Use prefetch in sratoolkit to download all sra file in
./prefetch --option-file /yjdata/data/Candidemia/SRR_Acc_List.txt -O /yjdata/data/Candidemia/sra/ # Continue download ./prefetch --option-file /yjdata/data/Candidemia/SRR_Acc_List.txt -O /yjdata/data/Candidemia/sra/ -r yes
fasterq dump
In the past, we uncompressed sra to fastq by fastq-dump. But now fastq-dump is replaced by fasterq-dump, which are multi-threaded operation and more faster.
# Prepare codes
less Healthy_RunTable.txt | awk 'BEGIN{FS = ",";OFS = "\t"}{print "echo "$1";""/yjdata/hxzk/utilities/sratoolkit.3.0.0-centos_linux64/bin/fasterq-dump --split-3 -O /yjdata/data/Candidemia/fq/ /yjdata/data/Candidemia/sra/"$1"/"$1".sra"}'
STAR
STAR is used to map fastq data to reference genome and GTF
# Build STAR reference index
STAR --runThreadN 20 --runMode genomeGenerate \
--genomeDir /home/hxzk/download/star/GRCh38_Ensembl99/ \
--genomeFastaFiles /home/hxzk/download/star/GRCh38_Ensembl99/Homo_sapiens.GRCh38.dna.primary_assembly.fa \
--sjdbGTFfile /home/hxzk/download/star/GRCh38_Ensembl99/Homo_sapiens.GRCh38.99.gtf \
--sjdbOverhang 100
Write a bash script for one sample
# star.sh
hg=$1
fq1=$2
fq2=$3
outprefix=$4
mkdir $outFileNamePrefix
STAR --runThreadN 20 --genomeDir $hg \
--runMode alignReads \
--quantMode TranscriptomeSAM GeneCounts \
--readFilesIn $fq1 $fq2 \
--outSAMtype BAM SortedByCoordinate \
--outFileNamePrefix $outprefix"/"
Write ta bash script for all sample
# Prepared code
less sample_table/Candidemia_Healthy_RunTable.txt | awk 'BEGIN{FS = ","}{print "bash star.sh /home/hxzk/download/star/GRCh38_Ensembl99/ /yjdata/data/Candidemia/fq/"$1"_1.fastq /yjdata/data/Candidemia/fq/"$1"_2.fastq ~/project/Candidemia/result/bam/"$1"/"}'
RSEM
build rsem index
# build RSEM references
/software/RSEM/usr/local/bin/rsem-prepare-reference --gtf /home/hxzk/download/star/GRCh38_Ensembl99/Homo_sapiens.GRCh38.99.gtf \
-p 10 \
--bowtie2 --bowtie2-path /software/bowtie2-2.4.2-sra-linux-x86_64/ \
/home/hxzk/download/star/GRCh38_Ensembl99/Homo_sapiens.GRCh38.dna.primary_assembly.fa \
/home/hxzk/download/RSEM_db/ref/human_refseq
Write a bash script for one sample
# rsem.sh
bamfile=$1
fa=$2
outprefix=$3
#/software/RSEM/usr/local/bin/rsem-calculate-expression --forward-prob 0.5 --paired-end -p 8 --bam $bamfile $fa $outprefix"/"
/software/RSEM/usr/local/bin/rsem-calculate-expression --paired-end \
--alignments \
-p 20 \
$bamfile \
$fa \
$outprefix
Write ta bash script for all sample
# Prepared code
less ~/project/Candidemia/result/sample_table/Candidemia_Healthy_RunTable.txt | awk 'BEGIN{FS = ","}{print "bash rsem.sh /home/hxzk/project/Candidemia/result/bam/"$1"/Aligned.toTranscriptome.out.bam /home/hxzk/download/RSEM_db/ref/human_refseq /home/hxzk/project/Candidemia/result/rsem/"$1}'
DEseq2
Collate single sample data into a table
#!/usr/bin/Rscript
library(data.table)
library(DESeq2)
setwd("/home/hxzk/project/Candidemia/")
# Read id, symbol data
id_symbol <- fread("~/project/Candidemia/data/genes.tsv", sep = " ", header = F)
id_symbol <- id_symbol[!duplicated(V1),]
# Read all rsem data from directory
dt_lst <- list()
for(f in dir("result/rsem", pattern = "*.genes.results")){
tmp_dt <- fread(paste0("result/rsem/",f))
Sample <- strsplit(f, split = ".", fixed = T)[[1]][1]
tmp_dt$Sample <- Sample
dt_lst[[Sample]] <- tmp_dt
}
dt <- rbindlist(dt_lst)
# Transform to expression matrix
expr_dt <- dt[,.(gene_id, expected_count, Sample)]
dcast_dt <- dcast(expr_dt, gene_id ~ Sample, value.var = "expected_count")
saveRDS(dcast_dt, file = "result/analysis/expr_dt.rds")
Run DESeq2
dcast_dt <- readRDS("result/analysis/expr_dt.rds")
# Read clinical from sra
sra_dt <- fread("result/sample_table/Candidemia_Healthy_RunTable.txt", header = F, sep = ",")
expr <- data.frame(dcast_dt[, 2:ncol(dcast_dt), with = F])
rownames(expr) <- dcast_dt$gene_id
# Transform gene id to gene symbol
expr <- expr[rownames(expr) %in% id_symbol$V1,]
symbol <- id_symbol[rownames(expr), V2, on = "V1"]
# Sum expression same symbol
expr_dt2 <- cbind(expr, symbol)
dt_melt <- melt(expr_dt2, id.vars = "symbol")
dcast_dt <- dcast(dt_melt, symbol ~ variable, fun.aggregate = sum)
expr <- data.frame(dcast_dt[,2:ncol(dcast_dt)])
rownames(expr) <- dcast_dt$symbol
# Match clincal to sample.
#Type is condition that we want to compare, while gender is variable that we want to control its affect.
clinical_dt <- sra_dt[,.(Sample = V1, Type = V23, Gender = V15)]
group <- clinical_dt[colnames(expr),Type, on = "Sample"]
gender <- clinical_dt[colnames(expr),Gender, on = "Sample"]
# Highly expressed genes
# Filtered low expression genes
filtered_id <- which(apply(expr, 1, function(x)length(x[x > 1])>=(0.1*ncol(expr))))
mt <- expr[filtered_id,]
mt <- round(mt, 0)
## Coldata to DESeq2
coldata <- data.table(group, gender)
coldata$group <- factor(group, levels = c("Healthy", "Candidemia"))
coldata$gender <- factor(gender, levels = c("female", "male"))
# Identification of differentially expressed genes
ddsFullCountTable <- DESeqDataSetFromMatrix(countData = mt,
colData = coldata, design= ~ gender + group)
dds <- DESeq(ddsFullCountTable)
res <- results(dds)
cutoff <- 0.05
res05 <- results(dds, alpha=cutoff)
write.table(res05, file = "result/pvalue_result.txt", sep = "\t", col.names = T, quote = F)
Database
Some files can be downloaded from follows:
STAR and RSEM referece genome: https://ftp.ensembl.org/pub/release-99/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
STAR and RSEM GTF:
https://ftp.ensembl.org/pub/release-99/gtf/homo_sapiens/Homo_sapiens.GRCh38.99.gtf.gz
id_symbole table:
Link:https://pan.baidu.com/s/1Kebn7A6bm5_0XUUvQJSXtA
Extraction code:az7l
References
The manual of STAR
NCBIwebsite: https://www.ncbi.nlm.nih.gov/
生信螺丝钉:https://zhuanlan.zhihu.com/p/360792123
DESeq2 manual: https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html