Final version

This commit is contained in:
MarieMih 2023-12-18 14:31:34 +03:00
parent c4f360144d
commit 4e470308e1
10 changed files with 795 additions and 84 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

401
chromosome.R Normal file
View File

@ -0,0 +1,401 @@
library(ggplot2)
library(stringr)
#####
#This part of scripts produces the plots of alignment data. Only genome-aligned reverse reads were taken here.
#For every sample was created a set of reads of equal capacity (min between all sample reads).
#####
#Uploading data in R
temp_name<-read.table("names_of_custom_genome.txt")
samples_ecoli_chr <- c()
for (i in temp_name[,c("V1")]) {
samples_ecoli_chr<-c(samples_ecoli_chr, gsub("_genome_mapq20_table.txt", "", gsub("custom_", "", i)))
}
tp2_genome <- data.frame(matrix(ncol = 24, nrow = 0))
colnames(tp2_genome) <- unlist(strsplit("ID;Read;Count_SNPs;Cycles;GC;Mean_length_del;Mean_length_insert;Read_length;Align_length;Dels;Inserts;A>C;A>G;A>T;C>A;C>G;C>T;G>A;G>C;G>T;T>A;T>C;T>G",";"))
for (i in temp_name[,c("V1")]) {
temp<-read.csv(i, sep=";", header=TRUE)
temp<-subset(temp, Read==2)
temp<-temp[sample(nrow(temp), 500000),]
temp<-cbind(temp, "Sample"=gsub("_genome_mapq20_table.txt", "", gsub("custom_", "", i)))
tp2_genome<-rbind(tp2_genome, temp)
}
rm(temp, temp_name)
tp2_genome$Sample<-as.factor(tp2_genome$Sample)
dir.create("../report/images_new_genome")
#GC-content
ggplot(data = tp2_genome, aes(x = GC)) +
geom_density(stat = "density", fill = "pink", alpha = 0.5) +
facet_wrap(~Sample, nrow = 3) +
geom_vline(xintercept = 50, linetype="dotted") +
xlab("ГЦ-состав, %") +
ylab("Доля прочтений") +
xlim(25, 75) +
ylim(0, 0.15) +
theme(plot.title = element_text(hjust = 0.5))
ggsave("../report/images_new_genome/gc.png", width = 2250, height = 1500, units = "px")
#Mismatches per cycle of sequence
df2<-data.frame(matrix(nrow = 0, ncol = 2))
colnames(df2)<-c("Cycles", "Sample")
for (i in samples_ecoli_chr) {
cyc <-subset(tp2_genome, Sample == i)[,4]
cyc <- as.numeric(unlist(strsplit(cyc,",")))
df2<-rbind(df2, cbind("Cycles" = cyc, "Sample" =i))
}
df2$Sample<-as.factor(df2$Sample)
df2$Cycles<-as.numeric(df2$Cycles)
rm (cyc, i)
ggplot(df2, aes(x = Cycles)) +
geom_density(stat = "count", fill = "blue", alpha = 0.5) +
#geom_line(stat = "count", fill = "blue", alpha = 0.5) +
facet_wrap(~Sample, nrow = 3) +
xlab("Цикл прочтения, №")+
ylab("Количество замен")+
ggtitle("Количество однонуклеотидных замен по циклам") +
theme(plot.title = element_text(hjust = 0.5))
ggsave("../report/images_new_genome/mpc.png", width = 2250, height = 1500, units = "px")
ggplot(df2, aes(x = Cycles)) +
geom_density(stat = "density", fill = "blue", alpha = 0.5) +
facet_wrap(~Sample, nrow = 3) +
xlab("Цикл прочтения, №")+
ylab("Доля от общего количества")+
ggtitle("Количество однонуклеотидных замен по циклам") +
theme(plot.title = element_text(hjust = 0.5))
ggsave("../report/images_new_genome/mpc_density.png", width = 2250, height = 1500, units = "px")
rm (df2)
#Mismatches depending on GC-content
mgc2<-cbind(tp2_genome[, c("Count_SNPs", "Sample")], "GC" = round(tp2_genome$GC))
ggplot(mgc2, aes(x=GC, y=Count_SNPs)) +
stat_summary(fun = sum, geom="line", linewidth = 0.5) +
facet_wrap(~Sample, nrow = 3) +
xlab("ГЦ-состав, %")+
ylab("Количество однонуклеотидных замен")+
ggtitle("Количество однонуклеотидных замен в зависимости от ГЦ-состава прочтений") +
geom_vline(xintercept = 50, linetype="dotted") +
theme_bw()+
theme(plot.title = element_text(hjust = 0.5))
ggsave("../report/images_new_genome/mgc.png", width = 2250, height = 1500, units = "px")
ggplot(mgc2, aes(x=GC, y=Count_SNPs)) +
stat_summary(fun = mean, geom="line", size = 0.5) +
facet_wrap(~Sample, nrow = 3) +
xlab("ГЦ-состав, %")+
ylab("Среднее число однонуклеотидных замен на одно прочтение")+
ggtitle("Количество однонуклеотидных замен в зависимости от ГЦ-состава прочтения") +
geom_vline(xintercept = 50, linetype="dotted") +
theme_bw()+
theme(plot.title = element_text(hjust = 0.5))
ggsave("../report/images_new_genome/mgc_mean.png", width = 2250, height = 1500, units = "px")
rm (mgc2)
#Frequency of different types of mismatches
mtype2 <- data.frame(matrix(ncol = 3, nrow = 0))
colnames(mtype2)<- c("Sum", "type", "Sample")
for (i in samples_ecoli_chr) {
temp <- apply(tp2_genome[, c(12:23)][tp2_genome$Sample==i,], 2, sum)
temp <- data.frame("Sum" = temp)
temp$type<-as.factor(colnames(tp2_genome[, c(12:23)]))
temp<-cbind(temp, "Sample" = i)
mtype2<-rbind(mtype2, temp)
}
ggplot(mtype2) +
geom_line(aes(x = type, y = Sum, group = Sample)) +
geom_point(aes(x = type, y = Sum)) +
#facet_wrap(~Sample, nrow = 3, scales = "free_y") +
facet_wrap(~Sample, nrow = 3, ) +
xlab("Тип однонуклеотидной замены")+
ylab("Количество однонуклеотидных замен")+
ggtitle("Частота типов однонуклеотидных замен") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("../report/images_new_genome/mt.png", width = 2250, height = 1500, units = "px")
mtype2_2 <- data.frame(matrix(ncol = 3, nrow = 0))
colnames(mtype2_2)<- c("mean", "type", "Sample")
for (i in samples_ecoli_chr) {
temp <- apply(tp2_genome[tp2_genome$Sample==i, c(12:23)], 2, mean)
temp <- data.frame("mean" = temp)
temp$type<-as.factor(colnames(tp2_genome[, c(12:23)]))
temp<-cbind(temp, "Sample" = i)
mtype2_2<-rbind(mtype2_2, temp)
}
ggplot(mtype2_2) +
geom_line(aes(x = type, y = mean, group = Sample)) +
geom_point(aes(x = type, y = mean)) +
#facet_wrap(~Sample, nrow = 3, scales = "free_y") +
facet_wrap(~Sample, nrow = 3) +
xlab("Тип однонуклеотидной замены")+
ylab("Среднее по прочтению")+
ggtitle("Частота типов однонуклеотидных замен") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("../report/images_new_genome/mt_mean.png", width = 2250, height = 1500, units = "px")
rm(mtype2, mtype2_2)
#Number of deletions per sample
delets2 <- data.frame(matrix(ncol = 2, nrow = 0))
colnames(delets2)<- c("Nucleotide", "Sample")
for (i in samples_ecoli_chr) {
temp <- tp2_genome[tp2_genome$Sample==i,][,10]
temp <- unlist(strsplit(temp,""))
temp<- temp[temp %in% c(letters, LETTERS)]
temp<-cbind("Nucleotide"=temp, "Sample" = i)
temp<-data.frame(temp)
temp$Nucleotide <- as.factor(temp$Nucleotide)
delets2<-rbind(temp, delets2)
}
ggplot(delets2) +
geom_bar(aes(x = Nucleotide), width = 0.4, fill = "blue", alpha = 0.5, col = "black") +
facet_wrap(~Sample, nrow = 3) +
#ylim(0, 1500) +
xlab("Нуклеотид")+
ylab("Количество делеций")+
ggtitle("Состав делеций") +
theme(plot.title = element_text(hjust = 0.5))
ggsave("../report/images_new_genome/delets.png", width = 2250, height = 1500, units = "px")
rm (delets2)
#Number of insertions per sample
inserts2 <- data.frame(matrix(ncol = 2, nrow = 0))
colnames(inserts2)<- c("Nucleotide", "Sample")
for (i in samples_ecoli_chr) {
temp <- tp2_genome[tp2_genome$Sample==i,][,11]
temp <- unlist(strsplit(temp,""))
temp<- temp[temp %in% c(letters, LETTERS)]
temp<-cbind("Nucleotide"=temp, "Sample" = i)
temp<-data.frame(temp)
temp$Nucleotide <- as.factor(temp$Nucleotide)
inserts2<-rbind(temp, inserts2)
}
ggplot(inserts2) +
xlab("Нуклеотид")+
ylab("Количество вставок")+
ggtitle("Состав вставок") +
geom_bar(aes(x = Nucleotide), width = 0.4, fill = "blue", alpha = 0.5, col = "black") +
facet_wrap(~Sample, nrow = 3) +
#ylim(0, 2000) +
theme(plot.title = element_text(hjust = 0.5))
ggsave("../report/images_new_genome/inserts.png", width = 2250, height = 1500, units = "px")
rm (inserts2)
#Mean, min and max values of insertions per sample
insert_mean2 <- data.frame(matrix(ncol = 7, nrow = 0))
colnames(insert_mean2)<- c("Sample", "Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.")
for (i in samples_ecoli_chr) {
temp<-subset(tp2_genome, Mean_length_del!=0 & Sample == i)[,7]
temp<-cbind("Sample"=i, t(summary(temp)))
insert_mean2<-rbind(insert_mean2, temp)
}
insert_mean2[, 2]<-as.numeric(insert_mean2[, 2])
insert_mean2[, 4]<-as.numeric(insert_mean2[, 4])
insert_mean2[, 5]<-as.numeric(insert_mean2[, 5])
insert_mean2[, 7]<-as.numeric(insert_mean2[, 7])
ggplot(data.frame(insert_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) +
geom_pointrange(aes(ymin = 0, ymax = Max.)) +
geom_point(aes(x = factor(Sample), y = 0), shape = 1) +
geom_point(aes(x = factor(Sample), y = Max.), shape = 1) +
xlab("Образец")+
ylab("Количество нуклеотидов")+
ggtitle("Статистика длины вставки") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("../report/images_new_genome/iml.png", width = 2250, height = 1500, units = "px")
rm (insert_mean2)
#Mean, min and max values of deletions per sample
delete_mean2 <- data.frame(matrix(ncol = 7, nrow = 0))
colnames(delete_mean2)<- c("Sample", "Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.")
for (i in samples_ecoli_chr) {
temp<-subset(tp2_genome, Mean_length_del!=0 & Sample == i)[,6]
temp<-cbind("Sample"=i, t(summary(temp)))
delete_mean2<-rbind(delete_mean2, temp)
}
delete_mean2[, 2]<-as.numeric(delete_mean2[, 2])
delete_mean2[, 4]<-as.numeric(delete_mean2[, 4])
delete_mean2[, 5]<-as.numeric(delete_mean2[, 5])
delete_mean2[, 7]<-as.numeric(delete_mean2[, 7])
ggplot(data.frame(delete_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) +
geom_pointrange(aes(ymin = 0, ymax = Max.)) +
geom_point(aes(x = factor(Sample), y = 0), shape = 1) +
geom_point(aes(x = factor(Sample), y = Max.), shape = 1) +
xlab("Образец")+
ylab("Количество нуклеотидов")+
ggtitle("Статистика длины делеции") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("../report/images_new_genome/dml.png", width = 2250, height = 1500, units = "px")
rm(delete_mean2)
#Number of mismatches per sample
ggplot(tp2_genome, aes(x=Sample, y=Count_SNPs, group = 1)) +
stat_summary(fun = sum, geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
xlab("Образец")+
ylab("Количество замен")+
ggtitle("Количество однонуклеотидных замен") +
#geom_vline(xintercept = 50, linetype="dotted") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("../report/images_new_genome/mps_mean.png", width = 2250, height = 1500, units = "px")
ggplot(tp2_genome, aes(x=Sample, y=Count_SNPs, group = 1)) +
stat_summary(fun = mean, geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
xlab("Образец")+
ylab("Среднее число замен")+
ggtitle("Количество однонуклеотидных замен на одно прочтение") +
#geom_vline(xintercept = 50, linetype="dotted") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("../report/images_new_genome/mps.png", width = 2250, height = 1500, units = "px")
#Number of deletions per cycle of sequence
numbers_only <- function(x) !grepl("\\D", x)
dels2 <- data.frame(matrix(nrow = 0, ncol = 2))
colnames(dels2)<-c("Num", "Sample")
for (i in samples_ecoli_chr) {
temp <- tp2_genome[tp2_genome$Sample==i,][,10]
temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
temp <- unlist(strsplit(temp,","))
temp<- temp[numbers_only(temp)]
temp<-cbind("Num"=temp, "Sample" = i)
temp<-data.frame(temp)
dels2<-rbind(temp, dels2)
}
dels2$Num<-as.numeric(dels2$Num)
ggplot(dels2, aes(x = Num)) +
#geom_density(stat = "count", alpha = 0.5) +
stat_count(geom="line", position="identity") +
stat_count(geom="point", position="identity", size = 1) +
facet_wrap(~Sample, nrow = 3) +
#ylim(0, 1500) +
xlab("Цикл прочтения, №")+
ylab("Количество делеций")+
ggtitle("Число делеций на цикл") +
theme_bw()+
theme(plot.title = element_text(hjust = 0.5))
ggsave("../report/images_new_genome/dpc.png", width = 2250, height = 1500, units = "px")
rm(dels2)
#Number of insertions per cycle of sequence
ins2 <- data.frame(matrix(nrow = 0, ncol = 2))
colnames(ins2)<-c("Num", "Sample")
for (i in samples_ecoli_chr) {
temp <- tp2_genome[tp2_genome$Sample==i,][,11]
temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
temp <- unlist(strsplit(temp,","))
temp<- temp[numbers_only(temp)]
temp<-cbind("Num"=temp, "Sample" = i)
temp<-data.frame(temp)
ins2<-rbind(temp, ins2)
}
ins2$Num<-as.numeric(ins2$Num)
ggplot(ins2, aes(x = Num)) +
stat_count(geom="line", position="identity") +
stat_count(geom="point", position="identity", size = 1) +
facet_wrap(~Sample, nrow = 3) +
#ylim(0, 1500) +
xlab("Цикл прочтения, №")+
ylab("Количество вставок")+
ggtitle("Число вставок на цикл") +
scale_x_continuous(breaks=seq(0, 150, 25))+
theme_bw()+
theme(plot.title = element_text(hjust = 0.5))
ggsave("../report/images_new_genome/ipc.png", width = 2250, height = 1500, units = "px")
rm (ins2)
#Number of deletions per sample
parse_delets2 <- data.frame(matrix(nrow = 0, ncol = 2))
colnames(parse_delets2)<-c("Num", "Sample")
for (i in samples_ecoli_chr) {
temp <- tp2_genome[tp2_genome$Sample==i,][,10]
temp <- unlist(strsplit(temp,split="|",fixed = TRUE))
temp<-cbind("Num"=temp, "Sample" = i)
temp<-data.frame(temp)
parse_delets2<-rbind(temp, parse_delets2)
}
ggplot(parse_delets2, aes(x=Sample, group = 1)) +
stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
xlab("Образец")+
ylab("Количество делеций")+
ggtitle("Число делеций на образец") +
#geom_vline(xintercept = 50, linetype="dotted") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("../report/images_new_genome/dps.png", width = 2250, height = 1500, units = "px")
rm (parse_delets2)
#Number of insertions per sample
parse_inserts2 <- data.frame(matrix(nrow = 0, ncol = 2))
colnames(parse_inserts2)<-c("Num", "Sample")
for (i in samples_ecoli_chr) {
temp <- tp2_genome[tp2_genome$Sample==i,][,11]
temp <- unlist(strsplit(temp,split="|",fixed = TRUE))
temp<-cbind("Num"=temp, "Sample" = i)
temp<-data.frame(temp)
parse_inserts2<-rbind(temp, parse_inserts2)
}
ggplot(parse_inserts2, aes(x=Sample, group = 1)) +
stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
xlab("Образец")+
ylab("Количество вставок")+
ggtitle("Число вставок на образец") +
#geom_vline(xintercept = 50, linetype="dotted") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("../report/images_new_genome/ips.png", width = 2250, height = 1500, units = "px")
rm (parse_inserts2, temp)
#Frequency of insertion motifs per sample
insert_motifs2 <- data.frame(matrix(nrow = 0, ncol = 2))
colnames(insert_motifs2)<-c("Num", "Sample")
for (i in samples_ecoli_chr) {
temp <- tp2_genome[tp2_genome$Sample==i,][,11]
temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
temp <- unlist(strsplit(temp,","))
temp<- temp[!numbers_only(temp)]
temp<-cbind("Num"=temp, "Sample" = i)
temp<-data.frame(temp)
insert_motifs2<-rbind(temp, insert_motifs2)
}
ggplot(insert_motifs2, aes(x = Num)) +
stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) +
facet_wrap(~Sample, nrow=4, scales = "free_y") +
xlab("Мотив")+
ylab("Количество")+
ggtitle("Представленность мотивов вставок в образцах") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("../report/images_new_genome/imps.png", width = 10550, height = 2500, units = "px")
rm (insert_motifs2, temp)
#Frequency of deletion motifs per sample
delete_motifs2 <- data.frame(matrix(nrow = 0, ncol = 2))
colnames(delete_motifs2)<-c("Num", "Sample")
for (i in samples_ecoli_chr) {
temp <- tp2_genome[tp2_genome$Sample==i,][,10]
temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
temp <- unlist(strsplit(temp,","))
temp<- temp[!numbers_only(temp)]
temp<-cbind("Num"=temp, "Sample" = i)
temp<-data.frame(temp)
delete_motifs2<-rbind(temp, delete_motifs2)
}
ggplot(delete_motifs2, aes(x = Num)) +
stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) +
facet_wrap(~Sample, nrow=4, scales = "free_y") +
xlab("Мотив")+
ylab("Количество")+
ggtitle("Представленность мотивов делеций в образцах") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("../report/images_new_genome/dmps.png", width = 10550, height = 2500, units = "px")
rm (delete_motifs2, temp)

View File

@ -0,0 +1,46 @@
#!/bin/bash
source /storage/data1/marmi/miniconda/installation/etc/profile.d/conda.sh
conda activate marmi_methylome
#перед каждым запуском заполнять эти пять переменных и path_to_sample
input_file="libs_11_12_23.csv"
cell="V350165701"
path_to_run="/storage/data1/aug/V350165701_repair"
goal_dir="custom_libs_E_Coli_Zvl"
path_to_assembly="/storage/data1/marmi/dec_custom_libs"
path_to_python="/storage/data1/marmi/trial_scripts/split_assembly.py"
path_to_python_chr="/storage/data1/marmi/trial_scripts/chromosome.py"
#выкидываем заголовок и заменяем пробелы на нижние подчеркивания
cp $input_file source_csv.csv
sed -i 's/ /_/g' source_csv.csv
<source_csv.csv awk -v a_cell="${cell}" -F"," 'NR!=1{print $1 " " a_cell "_L0" $4 "_" $3 "_1.fq.gz" " " a_cell "_L0" $4 "_" $3 "_2.fq.gz" " " $2 " " $4}' >reads_fastp.txt
rm source_csv.csv
cat reads_fastp.txt | while read i || [[ -n $i ]];
do
dir=`echo $i| awk '{split($0, array); print array[1]}'`
if [ "$dir" == "$goal_dir" ]; then
fn1=`echo $i| awk '{split($0, array); print array[2]}'`
fn2=`echo $i| awk '{split($0, array); print array[3]}'`
sample=`echo $i| awk '{split($0, array); print array[4]}'`
lane=`echo $i| awk '{split($0, array); print array[5]}'`
path_to_sample="/storage/data1/marmi/dec_custom_libs_repaired_${sample}"
mkdir ${path_to_sample}
mkdir ${path_to_sample}/qc_source_files
cd ${path_to_sample}/qc_source_files
read1="${path_to_run}/L0${lane}/${fn1}"
read2="${path_to_run}/L0${lane}/${fn2}"
fastp -e 30 -w 7 --in1 ${read1} --in2 ${read2} \
--out1 custom_${sample}_1.fastq.gz --out2 custom_${sample}_2.fastq.gz \
--unpaired1 custom_${sample}_u.fastq.gz --unpaired2 custom_${sample}_u.fastq.gz \
1>fastp_custom_${sample}_output.txt 2>fastp_custom_${sample}_err.txt
fi
done
#НЕ УДАЛЯТЬ READS_FASTP.TXT
conda deactivate
conda activate marmi_telegram
telegram-send "Work_custom_union_assembly first step ends at `date`."
conda deactivate

View File

@ -0,0 +1,61 @@
#!/bin/bash
source /storage/data1/marmi/miniconda/installation/etc/profile.d/conda.sh
conda activate marmi_methylome
#перед каждым запуском заполнять эти три переменных и path_to_sample
path_to_assembly="/storage/data1/marmi/dec_custom_libs_zvl/unicycler_assembly_custom"
goal_dir="custom_libs_E_Coli_Zvl"
path_to_tables="/storage/data1/marmi/dec_custom_libs_zvl/info_tables_repaired"
mkdir /storage/data1/marmi/dec_custom_libs_zvl
path_to_python="/storage/data1/marmi/trial_scripts/info_pol_mapq_union.py"
mkdir ${path_to_tables}
touch ${path_to_tables}/names_of_custom_chromosome.txt
touch ${path_to_tables}/names_of_custom_plasmid.txt
cat reads_fastp.txt | while read i || [[ -n $i ]];
do
dir=`echo $i| awk '{split($0, array); print array[1]}'`
if [ "$dir" == "$goal_dir" ]; then
sample=`echo $i| awk '{split($0, array); print array[4]}'`
path_to_sample="/storage/data1/marmi/dec_custom_libs_repaired_${sample}"
name_of_dir="union_${sample}"
mkdir ${path_to_sample}/${name_of_dir}
cd ${path_to_sample}/${name_of_dir}
bwa mem -t 7 ${path_to_assembly}/assembly.fasta ../qc_source_files/custom_${sample}_1.fastq.gz \
../qc_source_files/custom_${sample}_2.fastq.gz >custom_${sample}_paired.sam \
2>bwa_mem_custom_${sample}_paired.log
bwa mem -t 7 ${path_to_assembly}/assembly.fasta ../qc_source_files/custom_${sample}_u.fastq.gz \
>custom_${sample}_unpaired.sam \
2>bwa_mem_custom_${sample}_unpaired.log
samtools view -Sb -o custom_${sample}_paired.bam custom_${sample}_paired.sam
samtools view -Sb -o custom_${sample}_unpaired.bam custom_${sample}_unpaired.sam
rm custom_${sample}_paired.sam
rm custom_${sample}_unpaired.sam
samtools sort -o custom_${sample}_paired_sorted.bam custom_${sample}_paired.bam
samtools sort -o custom_${sample}_unpaired_sorted.bam custom_${sample}_unpaired.bam
samtools merge -o custom_${sample}.bam \
custom_${sample}_paired_sorted.bam custom_${sample}_unpaired_sorted.bam
samtools index custom_${sample}.bam
rm custom_${sample}_paired_sorted.bam
rm custom_${sample}_unpaired_sorted.bam
rm custom_${sample}_paired.bam
rm custom_${sample}_unpaired.bam
touch custom_${sample}_genome.txt
touch custom_${sample}_plasmid.txt
cat ${path_to_assembly}/chromosome_genome.txt | while read j || [[ -n $j ]];
do
samtools view -F 4 custom_${sample}.bam $j | grep 'MD:Z' >> custom_${sample}_genome.txt
done
cat ${path_to_assembly}/chromosome_plasmid.txt | while read j || [[ -n $j ]];
do
samtools view -F 4 custom_${sample}.bam $j | grep 'MD:Z' >> custom_${sample}_plasmid.txt
done
python3 ${path_to_python} ${sample}_genome ${path_to_sample}/${name_of_dir}/ ${path_to_tables}/
python3 ${path_to_python} ${sample}_plasmid ${path_to_sample}/${name_of_dir}/ ${path_to_tables}/
echo "custom_${sample}_genome_mapq20_table.txt" >>${path_to_tables}/names_of_custom_chromosome.txt
echo "custom_${sample}_plasmid_mapq20_table.txt" >>${path_to_tables}/names_of_custom_plasmid.txt
fi
done
conda deactivate
conda activate marmi_telegram
telegram-send "Work_custom_union_align second step ends at `date`."
conda deactivate

170
info_pol_mapq_union.py Normal file
View File

@ -0,0 +1,170 @@
# -*- coding: utf-8 -*-
import sys
import re
def GC(buf):
gc = buf.upper().count('G') + buf.upper().count('C')
all = buf.upper().count('A') + buf.upper().count('T') + gc
return (gc / all * 100)
def len_align(buf):
"""
нужно передавать CIGAR
"""
let = re.split('[0-9]+', buf)
let.remove("")
num = re.split('[MIDSH]{1}', buf)
num.remove("")
num = list(map(int, num))
length = 0
length_read = 0
for i in range(len(let)):
if (let[i]=="M") or (let[i]=="I"):
length+=num[i]
length_read+=num[i]
elif ((let[i]=="S") or (let[i]=="H")):
length_read+=num[i]
return (length, length_read)
def info (buf, arg = "I"):
"""
Передавать CIGAR
"""
let = re.split('[0-9]+', buf)
let.remove("")
num = re.split('[MIDSH]{1}', buf)
num.remove("")
num = list(map(int, num))
count = 0
for i in range(len(let)):
if (let[i]==arg):
count+=num[i]
return count
def info_insert (buf):
"""
Передавать CIGAR, возращает позиции инсерции, включая левый символ и не включая правый символ
"""
let = re.split('[0-9]+', buf)
let.remove("")
num = re.split('[MIDSH]{1}', buf)
num.remove("")
num = list(map(int, num))
start = 0
intervals = []
for i in range(len(let)):
if (let[i]=="I"):
intervals.append([start, start+num[i]])
if (let[i]=="D") or (let[i]=="H"):
continue
start+=num[i]
return tuple(intervals)
def removal (buf):
"""
Передавать CIGAR, возращает позиции удаления, включая левый символ и не включая правый символ
"""
let = re.split('[0-9]+', buf)
let.remove("")
num = re.split('[MIDSH]{1}', buf)
num.remove("")
num = list(map(int, num))
start = 0
intervals = []
for i in range(len(let)):
if (let[i]=="I") or (let[i]=="S"):
intervals.append([start, start+num[i]])
if (let[i]=="D") or (let[i]=="H"):
continue
start+=num[i]
return tuple(intervals)
def type_snp_text (seq, buf, cigar):
positions_ins = info_insert(cigar)
inserts_text = ''
inserts = []
for i in positions_ins:
inserts.append((seq[i[0]:i[1]], i[0]))
inserts_text = inserts_text + str(seq[i[0]:i[1]]) + "," + str(i[0]) + '|'
inserts_text = inserts_text[:-1]
gc_per = GC(seq)
positions = removal(cigar)
for i in positions:
if (i[1]<(len(seq))):
seq = seq[:i[0]] + "_"*(i[1]-i[0]) + seq[i[1]:]
else:
seq = seq[:i[0]] + "_"*(i[1]-i[0])
seq = seq.replace("_", "")
buf = buf.replace("MD:Z:", "")
types = {"A>C":0, "A>G":0, "A>T":0, "C>A":0, "C>G":0, "C>T":0, "G>A":0, "G>C":0, "G>T":0, "T>A":0, "T>C":0, "T>G":0}
cycles_text = ''
dels_text = ''
cycles = []
dels = []
let = re.split('[0-9]+', buf)
let.pop(0)
num = re.split('[ATGC^]+', buf)
num = list(map(int, num))
iter = 0
for i in range(len(num)):
iter+=num[i]
if (let[i] == '') or (let[i] == '\n'):
continue
if (let[i][0] == '^'):
dels_text = dels_text + str(let[i][1:]) + "," + str(iter) + "|"
dels.append((let[i][1:], iter))
continue
if (seq[iter]=="A") or (seq[iter]=="C") or (seq[iter]=="G") or (seq[iter]=="T"):
key = let[i] + ">" + seq[iter]
types[key]+=1
cycles.append(iter)
cycles_text = cycles_text + str(iter) + ","
iter+=1
else:
iter+=1
l = len_align(cigar)
dels_text = dels_text[:-1]
cycles_text = cycles_text[:-1]
if len(dels)!=0:
len_del = sum([len(dels[i][0]) for i in range(len(dels))])/len(dels)
else:
len_del = 0
if len(inserts)!=0:
len_ins = sum([len(inserts[i][0]) for i in range(len(inserts))])/len(inserts)
else:
len_ins = 0
result = {"SNP": types, "Cycles": cycles_text, "Dels": dels_text, "Inserts" : inserts_text, "GC": gc_per, "Count_SNPs": sum(types.values()), "Mean_length_del": len_del, \
"Mean_length_insert": len_ins, "Read_length": l[1], "Align_length": l[0]}
return result
with open(sys.argv[3]+"custom_"+sys.argv[1]+"_mapq20_table.txt", 'w') as table:
header = "ID;Read;Count_SNPs;Cycles;GC;Mean_length_del;Mean_length_insert;Read_length;Align_length;Dels;Inserts;A>C;A>G;A>T;C>A;C>G;C>T;G>A;G>C;G>T;T>A;T>C;T>G\n"
table.write(header)
with open(sys.argv[2]+"custom_"+sys.argv[1]+".txt", 'r') as inp:
for line in inp:
columns = line.split("\t")
if int(columns[4])>20:
id = columns[0]
flag = columns[1]
binary = "{0:b}".format(int(flag))
if (binary[-1]=='0'):
typ = 0
elif len(binary)==7:
typ = 1
elif len(binary)>=8:
if (binary[-7]=='0') and (binary[-8]=='1'):
typ = 2
else:
typ = -1
else:
typ = -1
cigar = columns[5]
md = columns[12]
seq = columns[9]
inf = type_snp_text(seq, md, cigar)
stroke = id + ";" + str(typ) + ";" + str(inf['Count_SNPs']) + ";" + inf['Cycles'] + ";" + str(inf['GC']) + ";" + str(inf['Mean_length_del']) + ";" + str(inf['Mean_length_insert']) + ";" + str(inf['Read_length']) + ";" + str(inf['Align_length']) + ";" + inf['Dels'] + ";" + inf['Inserts'] + ";" + \
str(inf['SNP']['A>C']) + ";" + str(inf['SNP']['A>G']) + ";" + str(inf['SNP']['A>T']) + ";" + str(inf['SNP']['C>A']) + ";" + str(inf['SNP']['C>G']) + ";" + str(inf['SNP']['C>T']) + ";" + str(inf['SNP']['G>A']) + ";" + str(inf['SNP']['G>C']) + ";" + str(inf['SNP']['G>T']) + ";" + str(inf['SNP']['T>A']) + ";" + str(inf['SNP']['T>C']) + ";" + str(inf['SNP']['T>G']) + '\n'
table.write(stroke)

201
plasmid.R
View File

@ -8,29 +8,31 @@ library(stringr)
#Uploading data in R #Uploading data in R
temp_name<-read.table("names_of_custom_plasmid.txt") temp_name<-read.table("names_of_custom_plasmid.txt")
samples <- c() samples_ecoli <- c()
for (i in temp_name[,c("V1")]) { for (i in temp_name[,c("V1")]) {
samples<-c(samples, gsub("_plasmid_mapq20_table.txt", "", gsub("custom_", "", i))) samples_ecoli<-c(samples_ecoli, gsub("_plasmid_mapq20_table.txt", "", gsub("custom_", "", i)))
} }
tp2 <- data.frame(matrix(ncol = 24, nrow = 0)) tp2_plasmid <- data.frame(matrix(ncol = 24, nrow = 0))
colnames(tp2) <- unlist(strsplit("ID;Read;Count_SNPs;Cycles;GC;Mean_length_del;Mean_length_insert;Read_length;Align_length;Dels;Inserts;A>C;A>G;A>T;C>A;C>G;C>T;G>A;G>C;G>T;T>A;T>C;T>G",";")) colnames(tp2_plasmid) <- unlist(strsplit("ID;Read;Count_SNPs;Cycles;GC;Mean_length_del;Mean_length_insert;Read_length;Align_length;Dels;Inserts;A>C;A>G;A>T;C>A;C>G;C>T;G>A;G>C;G>T;T>A;T>C;T>G",";"))
for (i in temp_name[,c("V1")]) { for (i in temp_name[,c("V1")]) {
temp<-read.csv(i, sep=";", header=TRUE) temp<-read.csv(i, sep=";", header=TRUE)
temp<-subset(temp, Read==2) temp<-subset(temp, Read==2)
temp<-temp[sample(nrow(temp), 200000),] temp<-temp[sample(nrow(temp), 100000),]
temp<-cbind(temp, "Sample"=gsub("_plasmid_mapq20_table.txt", "", gsub("custom_", "", i))) temp<-cbind(temp, "Sample"=gsub("_plasmid_mapq20_table.txt", "", gsub("custom_", "", i)))
tp2<-rbind(tp2, temp) tp2_plasmid<-rbind(tp2_plasmid, temp)
} }
rm(temp, temp_name) rm(temp, temp_name)
tp2$Sample<-as.factor(tp2$Sample) tp2_plasmid$Sample<-as.factor(tp2_plasmid$Sample)
dir.create("../report/images_new_plasmid") dir.create("../report/images_new_plasmid")
#GC-content #GC-content
ggplot(data = tp2, aes(x = GC)) + ggplot(data = tp2_plasmid, aes(x = GC)) +
geom_density(stat = "density", fill = "pink", alpha = 0.5) + geom_density(stat = "density", fill = "pink", alpha = 0.5) +
facet_wrap(~Sample, nrow = 2) + facet_wrap(~Sample, nrow = 3) +
geom_vline(xintercept = 50, linetype="dotted") + geom_vline(xintercept = 50, linetype="dotted") +
xlab("ГЦ-состав, %") +
ylab("Доля прочтений") +
xlim(25, 75) + xlim(25, 75) +
ylim(0, 0.15) + ylim(0, 0.15) +
theme(plot.title = element_text(hjust = 0.5)) theme(plot.title = element_text(hjust = 0.5))
@ -40,8 +42,8 @@ ggsave("../report/images_new_plasmid/gc.png", width = 2250, height = 1500, units
df2<-data.frame(matrix(nrow = 0, ncol = 2)) df2<-data.frame(matrix(nrow = 0, ncol = 2))
colnames(df2)<-c("Cycles", "Sample") colnames(df2)<-c("Cycles", "Sample")
for (i in samples) { for (i in samples_ecoli) {
cyc <-subset(tp2, Sample == i)[,4] cyc <-subset(tp2_plasmid, Sample == i)[,4]
cyc <- as.numeric(unlist(strsplit(cyc,","))) cyc <- as.numeric(unlist(strsplit(cyc,",")))
df2<-rbind(df2, cbind("Cycles" = cyc, "Sample" =i)) df2<-rbind(df2, cbind("Cycles" = cyc, "Sample" =i))
} }
@ -51,33 +53,41 @@ rm (cyc, i)
ggplot(df2, aes(x = Cycles)) + ggplot(df2, aes(x = Cycles)) +
geom_density(stat = "count", fill = "blue", alpha = 0.5) + geom_density(stat = "count", fill = "blue", alpha = 0.5) +
#geom_line(stat = "count", fill = "blue", alpha = 0.5) + #geom_line(stat = "count", fill = "blue", alpha = 0.5) +
facet_wrap(~Sample, nrow = 2) + facet_wrap(~Sample, nrow = 3) +
ggtitle("Mismatches per cycle") + xlab("Цикл прочтения, №")+
ylab("Количество замен")+
ggtitle("Количество однонуклеотидных замен по циклам") +
theme(plot.title = element_text(hjust = 0.5)) theme(plot.title = element_text(hjust = 0.5))
ggsave("../report/images_new_plasmid/mpc.png", width = 2250, height = 1500, units = "px") ggsave("../report/images_new_plasmid/mpc.png", width = 2250, height = 1500, units = "px")
ggplot(df2, aes(x = Cycles)) + ggplot(df2, aes(x = Cycles)) +
geom_density(stat = "density", fill = "blue", alpha = 0.5) + geom_density(stat = "density", fill = "blue", alpha = 0.5) +
facet_wrap(~Sample, nrow = 2) + facet_wrap(~Sample, nrow = 3) +
ggtitle("Mismatches per cycle") + xlab("Цикл прочтения, №")+
ylab("Доля от общего количества")+
ggtitle("Количество однонуклеотидных замен по циклам") +
theme(plot.title = element_text(hjust = 0.5)) theme(plot.title = element_text(hjust = 0.5))
ggsave("../report/images_new_plasmid/mpc_density.png", width = 2250, height = 1500, units = "px") ggsave("../report/images_new_plasmid/mpc_density.png", width = 2250, height = 1500, units = "px")
rm (df2) rm (df2)
#Mismatches depending on GC-content #Mismatches depending on GC-content
mgc2<-cbind(tp2[, c("Count_SNPs", "Sample")], "GC" = round(tp2$GC)) mgc2<-cbind(tp2_plasmid[, c("Count_SNPs", "Sample")], "GC" = round(tp2_plasmid$GC))
ggplot(mgc2, aes(x=GC, y=Count_SNPs)) + ggplot(mgc2, aes(x=GC, y=Count_SNPs)) +
stat_summary(fun = sum, geom="line", linewidth = 0.5) + stat_summary(fun = sum, geom="line", linewidth = 0.5) +
facet_wrap(~Sample, nrow = 2) + facet_wrap(~Sample, nrow = 3) +
ggtitle("Mismatches/GC") + xlab("ГЦ-состав, %")+
ylab("Количество однонуклеотидных замен")+
ggtitle("Количество однонуклеотидных замен в зависимости от ГЦ-состава прочтений") +
geom_vline(xintercept = 50, linetype="dotted") + geom_vline(xintercept = 50, linetype="dotted") +
theme_bw()+ theme_bw()+
theme(plot.title = element_text(hjust = 0.5)) theme(plot.title = element_text(hjust = 0.5))
ggsave("../report/images_new_plasmid/mgc.png", width = 2250, height = 1500, units = "px") ggsave("../report/images_new_plasmid/mgc.png", width = 2250, height = 1500, units = "px")
ggplot(mgc2, aes(x=GC, y=Count_SNPs)) + ggplot(mgc2, aes(x=GC, y=Count_SNPs)) +
stat_summary(fun = mean, geom="line", size = 0.5) + stat_summary(fun = mean, geom="line", size = 0.5) +
facet_wrap(~Sample, nrow = 2) + facet_wrap(~Sample, nrow = 3) +
ggtitle("Mismatches/GC") + xlab("ГЦ-состав, %")+
ylab("Среднее число однонуклеотидных замен на одно прочтение")+
ggtitle("Количество однонуклеотидных замен в зависимости от ГЦ-состава прочтения") +
geom_vline(xintercept = 50, linetype="dotted") + geom_vline(xintercept = 50, linetype="dotted") +
theme_bw()+ theme_bw()+
theme(plot.title = element_text(hjust = 0.5)) theme(plot.title = element_text(hjust = 0.5))
@ -88,37 +98,41 @@ rm (mgc2)
mtype2 <- data.frame(matrix(ncol = 3, nrow = 0)) mtype2 <- data.frame(matrix(ncol = 3, nrow = 0))
colnames(mtype2)<- c("Sum", "type", "Sample") colnames(mtype2)<- c("Sum", "type", "Sample")
for (i in samples) { for (i in samples_ecoli) {
temp <- apply(tp2[, c(12:23)][tp2$Sample==i,], 2, sum) temp <- apply(tp2_plasmid[, c(12:23)][tp2_plasmid$Sample==i,], 2, sum)
temp <- data.frame("Sum" = temp) temp <- data.frame("Sum" = temp)
temp$type<-as.factor(colnames(tp2[, c(12:23)])) temp$type<-as.factor(colnames(tp2_plasmid[, c(12:23)]))
temp<-cbind(temp, "Sample" = i) temp<-cbind(temp, "Sample" = i)
mtype2<-rbind(mtype2, temp) mtype2<-rbind(mtype2, temp)
} }
ggplot(mtype2) + ggplot(mtype2) +
geom_line(aes(x = type, y = Sum, group = Sample)) + geom_line(aes(x = type, y = Sum, group = Sample)) +
geom_point(aes(x = type, y = Sum)) + geom_point(aes(x = type, y = Sum)) +
#facet_wrap(~Sample, nrow = 2, scales = "free_y") + #facet_wrap(~Sample, nrow = 3, scales = "free_y") +
facet_wrap(~Sample, nrow = 2, ) + facet_wrap(~Sample, nrow = 3, ) +
ggtitle("Mismatch type") + xlab("Тип однонуклеотидной замены")+
ylab("Количество однонуклеотидных замен")+
ggtitle("Частота типов однонуклеотидных замен") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("../report/images_new_plasmid/mt.png", width = 2250, height = 1500, units = "px") ggsave("../report/images_new_plasmid/mt.png", width = 2250, height = 1500, units = "px")
mtype2_2 <- data.frame(matrix(ncol = 3, nrow = 0)) mtype2_2 <- data.frame(matrix(ncol = 3, nrow = 0))
colnames(mtype2_2)<- c("mean", "type", "Sample") colnames(mtype2_2)<- c("mean", "type", "Sample")
for (i in samples) { for (i in samples_ecoli) {
temp <- apply(tp2[tp2$Sample==i, c(12:23)], 2, mean) temp <- apply(tp2_plasmid[tp2_plasmid$Sample==i, c(12:23)], 2, mean)
temp <- data.frame("mean" = temp) temp <- data.frame("mean" = temp)
temp$type<-as.factor(colnames(tp2[, c(12:23)])) temp$type<-as.factor(colnames(tp2_plasmid[, c(12:23)]))
temp<-cbind(temp, "Sample" = i) temp<-cbind(temp, "Sample" = i)
mtype2_2<-rbind(mtype2_2, temp) mtype2_2<-rbind(mtype2_2, temp)
} }
ggplot(mtype2_2) + ggplot(mtype2_2) +
geom_line(aes(x = type, y = mean, group = Sample)) + geom_line(aes(x = type, y = mean, group = Sample)) +
geom_point(aes(x = type, y = mean)) + geom_point(aes(x = type, y = mean)) +
#facet_wrap(~Sample, nrow = 2, scales = "free_y") + #facet_wrap(~Sample, nrow = 3, scales = "free_y") +
facet_wrap(~Sample, nrow = 2) + facet_wrap(~Sample, nrow = 3) +
ggtitle("Mismatch type") + xlab("Тип однонуклеотидной замены")+
ylab("Среднее по прочтению")+
ggtitle("Частота типов однонуклеотидных замен") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("../report/images_new_plasmid/mt_mean.png", width = 2250, height = 1500, units = "px") ggsave("../report/images_new_plasmid/mt_mean.png", width = 2250, height = 1500, units = "px")
rm(mtype2, mtype2_2) rm(mtype2, mtype2_2)
@ -127,8 +141,8 @@ rm(mtype2, mtype2_2)
delets2 <- data.frame(matrix(ncol = 2, nrow = 0)) delets2 <- data.frame(matrix(ncol = 2, nrow = 0))
colnames(delets2)<- c("Nucleotide", "Sample") colnames(delets2)<- c("Nucleotide", "Sample")
for (i in samples) { for (i in samples_ecoli) {
temp <- tp2[tp2$Sample==i,][,10] temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,10]
temp <- unlist(strsplit(temp,"")) temp <- unlist(strsplit(temp,""))
temp<- temp[temp %in% c(letters, LETTERS)] temp<- temp[temp %in% c(letters, LETTERS)]
temp<-cbind("Nucleotide"=temp, "Sample" = i) temp<-cbind("Nucleotide"=temp, "Sample" = i)
@ -138,9 +152,11 @@ for (i in samples) {
} }
ggplot(delets2) + ggplot(delets2) +
geom_bar(aes(x = Nucleotide), width = 0.4, fill = "blue", alpha = 0.5, col = "black") + geom_bar(aes(x = Nucleotide), width = 0.4, fill = "blue", alpha = 0.5, col = "black") +
facet_wrap(~Sample, nrow = 2) + facet_wrap(~Sample, nrow = 3) +
#ylim(0, 1500) + #ylim(0, 1500) +
ggtitle ("Deletions") + xlab("Нуклеотид")+
ylab("Количество делеций")+
ggtitle("Состав делеций") +
theme(plot.title = element_text(hjust = 0.5)) theme(plot.title = element_text(hjust = 0.5))
ggsave("../report/images_new_plasmid/delets.png", width = 2250, height = 1500, units = "px") ggsave("../report/images_new_plasmid/delets.png", width = 2250, height = 1500, units = "px")
rm (delets2) rm (delets2)
@ -149,8 +165,8 @@ rm (delets2)
inserts2 <- data.frame(matrix(ncol = 2, nrow = 0)) inserts2 <- data.frame(matrix(ncol = 2, nrow = 0))
colnames(inserts2)<- c("Nucleotide", "Sample") colnames(inserts2)<- c("Nucleotide", "Sample")
for (i in samples) { for (i in samples_ecoli) {
temp <- tp2[tp2$Sample==i,][,11] temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,11]
temp <- unlist(strsplit(temp,"")) temp <- unlist(strsplit(temp,""))
temp<- temp[temp %in% c(letters, LETTERS)] temp<- temp[temp %in% c(letters, LETTERS)]
temp<-cbind("Nucleotide"=temp, "Sample" = i) temp<-cbind("Nucleotide"=temp, "Sample" = i)
@ -159,9 +175,11 @@ for (i in samples) {
inserts2<-rbind(temp, inserts2) inserts2<-rbind(temp, inserts2)
} }
ggplot(inserts2) + ggplot(inserts2) +
ggtitle("Insertions") + xlab("Нуклеотид")+
ylab("Количество вставок")+
ggtitle("Состав вставок") +
geom_bar(aes(x = Nucleotide), width = 0.4, fill = "blue", alpha = 0.5, col = "black") + geom_bar(aes(x = Nucleotide), width = 0.4, fill = "blue", alpha = 0.5, col = "black") +
facet_wrap(~Sample, nrow = 2) + facet_wrap(~Sample, nrow = 3) +
#ylim(0, 2000) + #ylim(0, 2000) +
theme(plot.title = element_text(hjust = 0.5)) theme(plot.title = element_text(hjust = 0.5))
ggsave("../report/images_new_plasmid/inserts.png", width = 2250, height = 1500, units = "px") ggsave("../report/images_new_plasmid/inserts.png", width = 2250, height = 1500, units = "px")
@ -171,8 +189,8 @@ rm (inserts2)
insert_mean2 <- data.frame(matrix(ncol = 7, nrow = 0)) insert_mean2 <- data.frame(matrix(ncol = 7, nrow = 0))
colnames(insert_mean2)<- c("Sample", "Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.") colnames(insert_mean2)<- c("Sample", "Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.")
for (i in samples) { for (i in samples_ecoli) {
temp<-subset(tp2, Mean_length_del!=0 & Sample == i)[,7] temp<-subset(tp2_plasmid, Mean_length_del!=0 & Sample == i)[,7]
temp<-cbind("Sample"=i, t(summary(temp))) temp<-cbind("Sample"=i, t(summary(temp)))
insert_mean2<-rbind(insert_mean2, temp) insert_mean2<-rbind(insert_mean2, temp)
} }
@ -182,12 +200,12 @@ insert_mean2[, 5]<-as.numeric(insert_mean2[, 5])
insert_mean2[, 7]<-as.numeric(insert_mean2[, 7]) insert_mean2[, 7]<-as.numeric(insert_mean2[, 7])
ggplot(data.frame(insert_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) + ggplot(data.frame(insert_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) +
geom_pointrange(aes(ymin = 0, ymax = Max.)) + geom_pointrange(aes(ymin = 0, ymax = Max.)) +
scale_y_continuous(name = "Length") +
geom_point(aes(x = factor(Sample), y = 0), shape = 1) + geom_point(aes(x = factor(Sample), y = 0), shape = 1) +
geom_point(aes(x = factor(Sample), y = Max.), shape = 1) + geom_point(aes(x = factor(Sample), y = Max.), shape = 1) +
ggtitle("Insertion mean length") + xlab("Образец")+
scale_x_discrete(name = "Sample") + ylab("Количество нуклеотидов")+
theme(plot.title = element_text(hjust = 0.5)) ggtitle("Статистика длины вставки") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("../report/images_new_plasmid/iml.png", width = 2250, height = 1500, units = "px") ggsave("../report/images_new_plasmid/iml.png", width = 2250, height = 1500, units = "px")
rm (insert_mean2) rm (insert_mean2)
@ -195,8 +213,8 @@ rm (insert_mean2)
delete_mean2 <- data.frame(matrix(ncol = 7, nrow = 0)) delete_mean2 <- data.frame(matrix(ncol = 7, nrow = 0))
colnames(delete_mean2)<- c("Sample", "Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.") colnames(delete_mean2)<- c("Sample", "Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.")
for (i in samples) { for (i in samples_ecoli) {
temp<-subset(tp2, Mean_length_del!=0 & Sample == i)[,6] temp<-subset(tp2_plasmid, Mean_length_del!=0 & Sample == i)[,6]
temp<-cbind("Sample"=i, t(summary(temp))) temp<-cbind("Sample"=i, t(summary(temp)))
delete_mean2<-rbind(delete_mean2, temp) delete_mean2<-rbind(delete_mean2, temp)
} }
@ -208,29 +226,32 @@ ggplot(data.frame(delete_mean2), aes(x = factor(Sample), y = Mean), size = 0.5)
geom_pointrange(aes(ymin = 0, ymax = Max.)) + geom_pointrange(aes(ymin = 0, ymax = Max.)) +
geom_point(aes(x = factor(Sample), y = 0), shape = 1) + geom_point(aes(x = factor(Sample), y = 0), shape = 1) +
geom_point(aes(x = factor(Sample), y = Max.), shape = 1) + geom_point(aes(x = factor(Sample), y = Max.), shape = 1) +
ggtitle("Deletion mean length") + xlab("Образец")+
scale_x_discrete(name = "Sample") + ylab("Количество нуклеотидов")+
scale_y_continuous(name = "Length") + ggtitle("Статистика длины делеции") +
theme(plot.title = element_text(hjust = 0.5)) theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("../report/images_new_plasmid/dml.png", width = 2250, height = 1500, units = "px") ggsave("../report/images_new_plasmid/dml.png", width = 2250, height = 1500, units = "px")
rm(delete_mean2) rm(delete_mean2)
#Number of mismatches per sample #Number of mismatches per sample
ggplot(tp2, aes(x=Sample, y=Count_SNPs, group = 1)) + ggplot(tp2_plasmid, aes(x=Sample, y=Count_SNPs, group = 1)) +
stat_summary(fun = sum, geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + stat_summary(fun = sum, geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
ggtitle("Mismatches per sample") + xlab("Образец")+
ylab("Количество замен")+
ggtitle("Количество однонуклеотидных замен") +
#geom_vline(xintercept = 50, linetype="dotted") + #geom_vline(xintercept = 50, linetype="dotted") +
theme(plot.title = element_text(hjust = 0.5)) theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("../report/images_new_plasmid/mps_mean.png", width = 2250, height = 1500, units = "px") ggsave("../report/images_new_plasmid/mps_mean.png", width = 2250, height = 1500, units = "px")
ggplot(tp2, aes(x=Sample, y=Count_SNPs, group = 1)) + ggplot(tp2_plasmid, aes(x=Sample, y=Count_SNPs, group = 1)) +
stat_summary(fun = mean, geom="point") + stat_summary(fun = mean, geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
stat_summary(fun = mean, geom="line") + xlab("Образец")+
ggtitle("Mismatches per sample") + ylab("Среднее число замен")+
ggtitle("Количество однонуклеотидных замен на одно прочтение") +
#geom_vline(xintercept = 50, linetype="dotted") + #geom_vline(xintercept = 50, linetype="dotted") +
theme(plot.title = element_text(hjust = 0.5)) theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("../report/images_new_plasmid/mps.png", width = 2250, height = 1500, units = "px") ggsave("../report/images_new_plasmid/mps.png", width = 2250, height = 1500, units = "px")
#Number of deletions per cycle of sequence #Number of deletions per cycle of sequence
@ -239,8 +260,8 @@ numbers_only <- function(x) !grepl("\\D", x)
dels2 <- data.frame(matrix(nrow = 0, ncol = 2)) dels2 <- data.frame(matrix(nrow = 0, ncol = 2))
colnames(dels2)<-c("Num", "Sample") colnames(dels2)<-c("Num", "Sample")
for (i in samples) { for (i in samples_ecoli) {
temp <- tp2[tp2$Sample==i,][,10] temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,10]
temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
temp <- unlist(strsplit(temp,",")) temp <- unlist(strsplit(temp,","))
temp<- temp[numbers_only(temp)] temp<- temp[numbers_only(temp)]
@ -253,9 +274,11 @@ ggplot(dels2, aes(x = Num)) +
#geom_density(stat = "count", alpha = 0.5) + #geom_density(stat = "count", alpha = 0.5) +
stat_count(geom="line", position="identity") + stat_count(geom="line", position="identity") +
stat_count(geom="point", position="identity", size = 1) + stat_count(geom="point", position="identity", size = 1) +
facet_wrap(~Sample, nrow = 2) + facet_wrap(~Sample, nrow = 3) +
#ylim(0, 1500) + #ylim(0, 1500) +
ggtitle ("Deletions per cycle") + xlab("Цикл прочтения, №")+
ylab("Количество делеций")+
ggtitle("Число делеций на цикл") +
theme_bw()+ theme_bw()+
theme(plot.title = element_text(hjust = 0.5)) theme(plot.title = element_text(hjust = 0.5))
ggsave("../report/images_new_plasmid/dpc.png", width = 2250, height = 1500, units = "px") ggsave("../report/images_new_plasmid/dpc.png", width = 2250, height = 1500, units = "px")
@ -265,8 +288,8 @@ rm(dels2)
ins2 <- data.frame(matrix(nrow = 0, ncol = 2)) ins2 <- data.frame(matrix(nrow = 0, ncol = 2))
colnames(ins2)<-c("Num", "Sample") colnames(ins2)<-c("Num", "Sample")
for (i in samples) { for (i in samples_ecoli) {
temp <- tp2[tp2$Sample==i,][,11] temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,11]
temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
temp <- unlist(strsplit(temp,",")) temp <- unlist(strsplit(temp,","))
temp<- temp[numbers_only(temp)] temp<- temp[numbers_only(temp)]
@ -278,9 +301,11 @@ ins2$Num<-as.numeric(ins2$Num)
ggplot(ins2, aes(x = Num)) + ggplot(ins2, aes(x = Num)) +
stat_count(geom="line", position="identity") + stat_count(geom="line", position="identity") +
stat_count(geom="point", position="identity", size = 1) + stat_count(geom="point", position="identity", size = 1) +
facet_wrap(~Sample, nrow = 2, scales = "free_y") + facet_wrap(~Sample, nrow = 3) +
#ylim(0, 1500) + #ylim(0, 1500) +
ggtitle ("Insertions per cycle") + xlab("Цикл прочтения, №")+
ylab("Количество вставок")+
ggtitle("Число вставок на цикл") +
scale_x_continuous(breaks=seq(0, 150, 25))+ scale_x_continuous(breaks=seq(0, 150, 25))+
theme_bw()+ theme_bw()+
theme(plot.title = element_text(hjust = 0.5)) theme(plot.title = element_text(hjust = 0.5))
@ -291,8 +316,8 @@ rm (ins2)
parse_delets2 <- data.frame(matrix(nrow = 0, ncol = 2)) parse_delets2 <- data.frame(matrix(nrow = 0, ncol = 2))
colnames(parse_delets2)<-c("Num", "Sample") colnames(parse_delets2)<-c("Num", "Sample")
for (i in samples) { for (i in samples_ecoli) {
temp <- tp2[tp2$Sample==i,][,10] temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,10]
temp <- unlist(strsplit(temp,split="|",fixed = TRUE)) temp <- unlist(strsplit(temp,split="|",fixed = TRUE))
temp<-cbind("Num"=temp, "Sample" = i) temp<-cbind("Num"=temp, "Sample" = i)
temp<-data.frame(temp) temp<-data.frame(temp)
@ -300,9 +325,11 @@ for (i in samples) {
} }
ggplot(parse_delets2, aes(x=Sample, group = 1)) + ggplot(parse_delets2, aes(x=Sample, group = 1)) +
stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
ggtitle("Deletions per sample") + xlab("Образец")+
ylab("Количество делеций")+
ggtitle("Число делеций на образец") +
#geom_vline(xintercept = 50, linetype="dotted") + #geom_vline(xintercept = 50, linetype="dotted") +
theme(plot.title = element_text(hjust = 0.5)) theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("../report/images_new_plasmid/dps.png", width = 2250, height = 1500, units = "px") ggsave("../report/images_new_plasmid/dps.png", width = 2250, height = 1500, units = "px")
rm (parse_delets2) rm (parse_delets2)
@ -310,8 +337,8 @@ rm (parse_delets2)
parse_inserts2 <- data.frame(matrix(nrow = 0, ncol = 2)) parse_inserts2 <- data.frame(matrix(nrow = 0, ncol = 2))
colnames(parse_inserts2)<-c("Num", "Sample") colnames(parse_inserts2)<-c("Num", "Sample")
for (i in samples) { for (i in samples_ecoli) {
temp <- tp2[tp2$Sample==i,][,11] temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,11]
temp <- unlist(strsplit(temp,split="|",fixed = TRUE)) temp <- unlist(strsplit(temp,split="|",fixed = TRUE))
temp<-cbind("Num"=temp, "Sample" = i) temp<-cbind("Num"=temp, "Sample" = i)
temp<-data.frame(temp) temp<-data.frame(temp)
@ -319,9 +346,11 @@ for (i in samples) {
} }
ggplot(parse_inserts2, aes(x=Sample, group = 1)) + ggplot(parse_inserts2, aes(x=Sample, group = 1)) +
stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
ggtitle("Insertions per sample") + xlab("Образец")+
ylab("Количество вставок")+
ggtitle("Число вставок на образец") +
#geom_vline(xintercept = 50, linetype="dotted") + #geom_vline(xintercept = 50, linetype="dotted") +
theme(plot.title = element_text(hjust = 0.5)) theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("../report/images_new_plasmid/ips.png", width = 2250, height = 1500, units = "px") ggsave("../report/images_new_plasmid/ips.png", width = 2250, height = 1500, units = "px")
rm (parse_inserts2, temp) rm (parse_inserts2, temp)
@ -329,8 +358,8 @@ rm (parse_inserts2, temp)
insert_motifs2 <- data.frame(matrix(nrow = 0, ncol = 2)) insert_motifs2 <- data.frame(matrix(nrow = 0, ncol = 2))
colnames(insert_motifs2)<-c("Num", "Sample") colnames(insert_motifs2)<-c("Num", "Sample")
for (i in samples) { for (i in samples_ecoli) {
temp <- tp2[tp2$Sample==i,][,11] temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,11]
temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
temp <- unlist(strsplit(temp,",")) temp <- unlist(strsplit(temp,","))
temp<- temp[!numbers_only(temp)] temp<- temp[!numbers_only(temp)]
@ -341,17 +370,19 @@ for (i in samples) {
ggplot(insert_motifs2, aes(x = Num)) + ggplot(insert_motifs2, aes(x = Num)) +
stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) + stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) +
facet_wrap(~Sample, nrow=4, scales = "free_y") + facet_wrap(~Sample, nrow=4, scales = "free_y") +
ggtitle("Insertion motifs per sample") + xlab("Мотив")+
ylab("Количество")+
ggtitle("Представленность мотивов вставок в образцах") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("../report/images_new_plasmid/imps.png", width = 3250, height = 2500, units = "px") ggsave("../report/images_new_plasmid/imps.png", width = 7550, height = 2500, units = "px")
rm (insert_motifs2, temp) rm (insert_motifs2, temp)
#Frequency of deletion motifs per sample #Frequency of deletion motifs per sample
delete_motifs2 <- data.frame(matrix(nrow = 0, ncol = 2)) delete_motifs2 <- data.frame(matrix(nrow = 0, ncol = 2))
colnames(delete_motifs2)<-c("Num", "Sample") colnames(delete_motifs2)<-c("Num", "Sample")
for (i in samples) { for (i in samples_ecoli) {
temp <- tp2[tp2$Sample==i,][,10] temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,10]
temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
temp <- unlist(strsplit(temp,",")) temp <- unlist(strsplit(temp,","))
temp<- temp[!numbers_only(temp)] temp<- temp[!numbers_only(temp)]
@ -362,7 +393,9 @@ for (i in samples) {
ggplot(delete_motifs2, aes(x = Num)) + ggplot(delete_motifs2, aes(x = Num)) +
stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) + stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) +
facet_wrap(~Sample, nrow=4, scales = "free_y") + facet_wrap(~Sample, nrow=4, scales = "free_y") +
ggtitle("Deletions motifs per sample") + xlab("Мотив")+
ylab("Количество")+
ggtitle("Представленность мотивов делеций в образцах") +
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("../report/images_new_plasmid/dmps.png", width = 4550, height = 2500, units = "px") ggsave("../report/images_new_plasmid/dmps.png", width = 7550, height = 2500, units = "px")
rm (delete_motifs2, temp) rm (delete_motifs2, temp)

BIN
qr-code.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.9 KiB