Final version
This commit is contained in:
parent
c4f360144d
commit
4e470308e1
BIN
Supplementary/Sup_ECOLI_genome_main.pdf
Normal file
BIN
Supplementary/Sup_ECOLI_genome_main.pdf
Normal file
Binary file not shown.
BIN
Supplementary/Sup_ECOLI_plasmid_main.pdf
Normal file
BIN
Supplementary/Sup_ECOLI_plasmid_main.pdf
Normal file
Binary file not shown.
BIN
Supplementary/Sup_ZVL_main.pdf
Normal file
BIN
Supplementary/Sup_ZVL_main.pdf
Normal file
Binary file not shown.
BIN
Supplementary/Sup_ZVL_pcr_free.pdf
Normal file
BIN
Supplementary/Sup_ZVL_pcr_free.pdf
Normal file
Binary file not shown.
401
chromosome.R
Normal file
401
chromosome.R
Normal file
@ -0,0 +1,401 @@
|
||||
library(ggplot2)
|
||||
library(stringr)
|
||||
#####
|
||||
#This part of scripts produces the plots of alignment data. Only genome-aligned reverse reads were taken here.
|
||||
#For every sample was created a set of reads of equal capacity (min between all sample reads).
|
||||
#####
|
||||
|
||||
#Uploading data in R
|
||||
|
||||
temp_name<-read.table("names_of_custom_genome.txt")
|
||||
samples_ecoli_chr <- c()
|
||||
for (i in temp_name[,c("V1")]) {
|
||||
samples_ecoli_chr<-c(samples_ecoli_chr, gsub("_genome_mapq20_table.txt", "", gsub("custom_", "", i)))
|
||||
}
|
||||
tp2_genome <- data.frame(matrix(ncol = 24, nrow = 0))
|
||||
colnames(tp2_genome) <- unlist(strsplit("ID;Read;Count_SNPs;Cycles;GC;Mean_length_del;Mean_length_insert;Read_length;Align_length;Dels;Inserts;A>C;A>G;A>T;C>A;C>G;C>T;G>A;G>C;G>T;T>A;T>C;T>G",";"))
|
||||
for (i in temp_name[,c("V1")]) {
|
||||
temp<-read.csv(i, sep=";", header=TRUE)
|
||||
temp<-subset(temp, Read==2)
|
||||
temp<-temp[sample(nrow(temp), 500000),]
|
||||
temp<-cbind(temp, "Sample"=gsub("_genome_mapq20_table.txt", "", gsub("custom_", "", i)))
|
||||
tp2_genome<-rbind(tp2_genome, temp)
|
||||
}
|
||||
rm(temp, temp_name)
|
||||
tp2_genome$Sample<-as.factor(tp2_genome$Sample)
|
||||
dir.create("../report/images_new_genome")
|
||||
|
||||
#GC-content
|
||||
|
||||
ggplot(data = tp2_genome, aes(x = GC)) +
|
||||
geom_density(stat = "density", fill = "pink", alpha = 0.5) +
|
||||
facet_wrap(~Sample, nrow = 3) +
|
||||
geom_vline(xintercept = 50, linetype="dotted") +
|
||||
xlab("ГЦ-состав, %") +
|
||||
ylab("Доля прочтений") +
|
||||
xlim(25, 75) +
|
||||
ylim(0, 0.15) +
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
ggsave("../report/images_new_genome/gc.png", width = 2250, height = 1500, units = "px")
|
||||
|
||||
#Mismatches per cycle of sequence
|
||||
|
||||
df2<-data.frame(matrix(nrow = 0, ncol = 2))
|
||||
colnames(df2)<-c("Cycles", "Sample")
|
||||
for (i in samples_ecoli_chr) {
|
||||
cyc <-subset(tp2_genome, Sample == i)[,4]
|
||||
cyc <- as.numeric(unlist(strsplit(cyc,",")))
|
||||
df2<-rbind(df2, cbind("Cycles" = cyc, "Sample" =i))
|
||||
}
|
||||
df2$Sample<-as.factor(df2$Sample)
|
||||
df2$Cycles<-as.numeric(df2$Cycles)
|
||||
rm (cyc, i)
|
||||
ggplot(df2, aes(x = Cycles)) +
|
||||
geom_density(stat = "count", fill = "blue", alpha = 0.5) +
|
||||
#geom_line(stat = "count", fill = "blue", alpha = 0.5) +
|
||||
facet_wrap(~Sample, nrow = 3) +
|
||||
xlab("Цикл прочтения, №")+
|
||||
ylab("Количество замен")+
|
||||
ggtitle("Количество однонуклеотидных замен по циклам") +
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
ggsave("../report/images_new_genome/mpc.png", width = 2250, height = 1500, units = "px")
|
||||
ggplot(df2, aes(x = Cycles)) +
|
||||
geom_density(stat = "density", fill = "blue", alpha = 0.5) +
|
||||
facet_wrap(~Sample, nrow = 3) +
|
||||
xlab("Цикл прочтения, №")+
|
||||
ylab("Доля от общего количества")+
|
||||
ggtitle("Количество однонуклеотидных замен по циклам") +
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
ggsave("../report/images_new_genome/mpc_density.png", width = 2250, height = 1500, units = "px")
|
||||
rm (df2)
|
||||
|
||||
#Mismatches depending on GC-content
|
||||
|
||||
mgc2<-cbind(tp2_genome[, c("Count_SNPs", "Sample")], "GC" = round(tp2_genome$GC))
|
||||
ggplot(mgc2, aes(x=GC, y=Count_SNPs)) +
|
||||
stat_summary(fun = sum, geom="line", linewidth = 0.5) +
|
||||
facet_wrap(~Sample, nrow = 3) +
|
||||
xlab("ГЦ-состав, %")+
|
||||
ylab("Количество однонуклеотидных замен")+
|
||||
ggtitle("Количество однонуклеотидных замен в зависимости от ГЦ-состава прочтений") +
|
||||
geom_vline(xintercept = 50, linetype="dotted") +
|
||||
theme_bw()+
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
ggsave("../report/images_new_genome/mgc.png", width = 2250, height = 1500, units = "px")
|
||||
ggplot(mgc2, aes(x=GC, y=Count_SNPs)) +
|
||||
stat_summary(fun = mean, geom="line", size = 0.5) +
|
||||
facet_wrap(~Sample, nrow = 3) +
|
||||
xlab("ГЦ-состав, %")+
|
||||
ylab("Среднее число однонуклеотидных замен на одно прочтение")+
|
||||
ggtitle("Количество однонуклеотидных замен в зависимости от ГЦ-состава прочтения") +
|
||||
geom_vline(xintercept = 50, linetype="dotted") +
|
||||
theme_bw()+
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
ggsave("../report/images_new_genome/mgc_mean.png", width = 2250, height = 1500, units = "px")
|
||||
rm (mgc2)
|
||||
|
||||
#Frequency of different types of mismatches
|
||||
|
||||
mtype2 <- data.frame(matrix(ncol = 3, nrow = 0))
|
||||
colnames(mtype2)<- c("Sum", "type", "Sample")
|
||||
for (i in samples_ecoli_chr) {
|
||||
temp <- apply(tp2_genome[, c(12:23)][tp2_genome$Sample==i,], 2, sum)
|
||||
temp <- data.frame("Sum" = temp)
|
||||
temp$type<-as.factor(colnames(tp2_genome[, c(12:23)]))
|
||||
temp<-cbind(temp, "Sample" = i)
|
||||
mtype2<-rbind(mtype2, temp)
|
||||
}
|
||||
ggplot(mtype2) +
|
||||
geom_line(aes(x = type, y = Sum, group = Sample)) +
|
||||
geom_point(aes(x = type, y = Sum)) +
|
||||
#facet_wrap(~Sample, nrow = 3, scales = "free_y") +
|
||||
facet_wrap(~Sample, nrow = 3, ) +
|
||||
xlab("Тип однонуклеотидной замены")+
|
||||
ylab("Количество однонуклеотидных замен")+
|
||||
ggtitle("Частота типов однонуклеотидных замен") +
|
||||
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
|
||||
ggsave("../report/images_new_genome/mt.png", width = 2250, height = 1500, units = "px")
|
||||
|
||||
mtype2_2 <- data.frame(matrix(ncol = 3, nrow = 0))
|
||||
colnames(mtype2_2)<- c("mean", "type", "Sample")
|
||||
for (i in samples_ecoli_chr) {
|
||||
temp <- apply(tp2_genome[tp2_genome$Sample==i, c(12:23)], 2, mean)
|
||||
temp <- data.frame("mean" = temp)
|
||||
temp$type<-as.factor(colnames(tp2_genome[, c(12:23)]))
|
||||
temp<-cbind(temp, "Sample" = i)
|
||||
mtype2_2<-rbind(mtype2_2, temp)
|
||||
}
|
||||
ggplot(mtype2_2) +
|
||||
geom_line(aes(x = type, y = mean, group = Sample)) +
|
||||
geom_point(aes(x = type, y = mean)) +
|
||||
#facet_wrap(~Sample, nrow = 3, scales = "free_y") +
|
||||
facet_wrap(~Sample, nrow = 3) +
|
||||
xlab("Тип однонуклеотидной замены")+
|
||||
ylab("Среднее по прочтению")+
|
||||
ggtitle("Частота типов однонуклеотидных замен") +
|
||||
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
|
||||
ggsave("../report/images_new_genome/mt_mean.png", width = 2250, height = 1500, units = "px")
|
||||
rm(mtype2, mtype2_2)
|
||||
|
||||
#Number of deletions per sample
|
||||
|
||||
delets2 <- data.frame(matrix(ncol = 2, nrow = 0))
|
||||
colnames(delets2)<- c("Nucleotide", "Sample")
|
||||
for (i in samples_ecoli_chr) {
|
||||
temp <- tp2_genome[tp2_genome$Sample==i,][,10]
|
||||
temp <- unlist(strsplit(temp,""))
|
||||
temp<- temp[temp %in% c(letters, LETTERS)]
|
||||
temp<-cbind("Nucleotide"=temp, "Sample" = i)
|
||||
temp<-data.frame(temp)
|
||||
temp$Nucleotide <- as.factor(temp$Nucleotide)
|
||||
delets2<-rbind(temp, delets2)
|
||||
}
|
||||
ggplot(delets2) +
|
||||
geom_bar(aes(x = Nucleotide), width = 0.4, fill = "blue", alpha = 0.5, col = "black") +
|
||||
facet_wrap(~Sample, nrow = 3) +
|
||||
#ylim(0, 1500) +
|
||||
xlab("Нуклеотид")+
|
||||
ylab("Количество делеций")+
|
||||
ggtitle("Состав делеций") +
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
ggsave("../report/images_new_genome/delets.png", width = 2250, height = 1500, units = "px")
|
||||
rm (delets2)
|
||||
|
||||
#Number of insertions per sample
|
||||
|
||||
inserts2 <- data.frame(matrix(ncol = 2, nrow = 0))
|
||||
colnames(inserts2)<- c("Nucleotide", "Sample")
|
||||
for (i in samples_ecoli_chr) {
|
||||
temp <- tp2_genome[tp2_genome$Sample==i,][,11]
|
||||
temp <- unlist(strsplit(temp,""))
|
||||
temp<- temp[temp %in% c(letters, LETTERS)]
|
||||
temp<-cbind("Nucleotide"=temp, "Sample" = i)
|
||||
temp<-data.frame(temp)
|
||||
temp$Nucleotide <- as.factor(temp$Nucleotide)
|
||||
inserts2<-rbind(temp, inserts2)
|
||||
}
|
||||
ggplot(inserts2) +
|
||||
xlab("Нуклеотид")+
|
||||
ylab("Количество вставок")+
|
||||
ggtitle("Состав вставок") +
|
||||
geom_bar(aes(x = Nucleotide), width = 0.4, fill = "blue", alpha = 0.5, col = "black") +
|
||||
facet_wrap(~Sample, nrow = 3) +
|
||||
#ylim(0, 2000) +
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
ggsave("../report/images_new_genome/inserts.png", width = 2250, height = 1500, units = "px")
|
||||
rm (inserts2)
|
||||
|
||||
#Mean, min and max values of insertions per sample
|
||||
|
||||
insert_mean2 <- data.frame(matrix(ncol = 7, nrow = 0))
|
||||
colnames(insert_mean2)<- c("Sample", "Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.")
|
||||
for (i in samples_ecoli_chr) {
|
||||
temp<-subset(tp2_genome, Mean_length_del!=0 & Sample == i)[,7]
|
||||
temp<-cbind("Sample"=i, t(summary(temp)))
|
||||
insert_mean2<-rbind(insert_mean2, temp)
|
||||
}
|
||||
insert_mean2[, 2]<-as.numeric(insert_mean2[, 2])
|
||||
insert_mean2[, 4]<-as.numeric(insert_mean2[, 4])
|
||||
insert_mean2[, 5]<-as.numeric(insert_mean2[, 5])
|
||||
insert_mean2[, 7]<-as.numeric(insert_mean2[, 7])
|
||||
ggplot(data.frame(insert_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) +
|
||||
geom_pointrange(aes(ymin = 0, ymax = Max.)) +
|
||||
geom_point(aes(x = factor(Sample), y = 0), shape = 1) +
|
||||
geom_point(aes(x = factor(Sample), y = Max.), shape = 1) +
|
||||
xlab("Образец")+
|
||||
ylab("Количество нуклеотидов")+
|
||||
ggtitle("Статистика длины вставки") +
|
||||
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
|
||||
ggsave("../report/images_new_genome/iml.png", width = 2250, height = 1500, units = "px")
|
||||
rm (insert_mean2)
|
||||
|
||||
#Mean, min and max values of deletions per sample
|
||||
|
||||
delete_mean2 <- data.frame(matrix(ncol = 7, nrow = 0))
|
||||
colnames(delete_mean2)<- c("Sample", "Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.")
|
||||
for (i in samples_ecoli_chr) {
|
||||
temp<-subset(tp2_genome, Mean_length_del!=0 & Sample == i)[,6]
|
||||
temp<-cbind("Sample"=i, t(summary(temp)))
|
||||
delete_mean2<-rbind(delete_mean2, temp)
|
||||
}
|
||||
delete_mean2[, 2]<-as.numeric(delete_mean2[, 2])
|
||||
delete_mean2[, 4]<-as.numeric(delete_mean2[, 4])
|
||||
delete_mean2[, 5]<-as.numeric(delete_mean2[, 5])
|
||||
delete_mean2[, 7]<-as.numeric(delete_mean2[, 7])
|
||||
ggplot(data.frame(delete_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) +
|
||||
geom_pointrange(aes(ymin = 0, ymax = Max.)) +
|
||||
geom_point(aes(x = factor(Sample), y = 0), shape = 1) +
|
||||
geom_point(aes(x = factor(Sample), y = Max.), shape = 1) +
|
||||
xlab("Образец")+
|
||||
ylab("Количество нуклеотидов")+
|
||||
ggtitle("Статистика длины делеции") +
|
||||
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
|
||||
ggsave("../report/images_new_genome/dml.png", width = 2250, height = 1500, units = "px")
|
||||
rm(delete_mean2)
|
||||
|
||||
#Number of mismatches per sample
|
||||
|
||||
ggplot(tp2_genome, aes(x=Sample, y=Count_SNPs, group = 1)) +
|
||||
stat_summary(fun = sum, geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
|
||||
xlab("Образец")+
|
||||
ylab("Количество замен")+
|
||||
ggtitle("Количество однонуклеотидных замен") +
|
||||
#geom_vline(xintercept = 50, linetype="dotted") +
|
||||
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
|
||||
ggsave("../report/images_new_genome/mps_mean.png", width = 2250, height = 1500, units = "px")
|
||||
|
||||
|
||||
ggplot(tp2_genome, aes(x=Sample, y=Count_SNPs, group = 1)) +
|
||||
stat_summary(fun = mean, geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
|
||||
xlab("Образец")+
|
||||
ylab("Среднее число замен")+
|
||||
ggtitle("Количество однонуклеотидных замен на одно прочтение") +
|
||||
#geom_vline(xintercept = 50, linetype="dotted") +
|
||||
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
|
||||
ggsave("../report/images_new_genome/mps.png", width = 2250, height = 1500, units = "px")
|
||||
|
||||
#Number of deletions per cycle of sequence
|
||||
|
||||
numbers_only <- function(x) !grepl("\\D", x)
|
||||
|
||||
dels2 <- data.frame(matrix(nrow = 0, ncol = 2))
|
||||
colnames(dels2)<-c("Num", "Sample")
|
||||
for (i in samples_ecoli_chr) {
|
||||
temp <- tp2_genome[tp2_genome$Sample==i,][,10]
|
||||
temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
|
||||
temp <- unlist(strsplit(temp,","))
|
||||
temp<- temp[numbers_only(temp)]
|
||||
temp<-cbind("Num"=temp, "Sample" = i)
|
||||
temp<-data.frame(temp)
|
||||
dels2<-rbind(temp, dels2)
|
||||
}
|
||||
dels2$Num<-as.numeric(dels2$Num)
|
||||
ggplot(dels2, aes(x = Num)) +
|
||||
#geom_density(stat = "count", alpha = 0.5) +
|
||||
stat_count(geom="line", position="identity") +
|
||||
stat_count(geom="point", position="identity", size = 1) +
|
||||
facet_wrap(~Sample, nrow = 3) +
|
||||
#ylim(0, 1500) +
|
||||
xlab("Цикл прочтения, №")+
|
||||
ylab("Количество делеций")+
|
||||
ggtitle("Число делеций на цикл") +
|
||||
theme_bw()+
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
ggsave("../report/images_new_genome/dpc.png", width = 2250, height = 1500, units = "px")
|
||||
rm(dels2)
|
||||
|
||||
#Number of insertions per cycle of sequence
|
||||
|
||||
ins2 <- data.frame(matrix(nrow = 0, ncol = 2))
|
||||
colnames(ins2)<-c("Num", "Sample")
|
||||
for (i in samples_ecoli_chr) {
|
||||
temp <- tp2_genome[tp2_genome$Sample==i,][,11]
|
||||
temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
|
||||
temp <- unlist(strsplit(temp,","))
|
||||
temp<- temp[numbers_only(temp)]
|
||||
temp<-cbind("Num"=temp, "Sample" = i)
|
||||
temp<-data.frame(temp)
|
||||
ins2<-rbind(temp, ins2)
|
||||
}
|
||||
ins2$Num<-as.numeric(ins2$Num)
|
||||
ggplot(ins2, aes(x = Num)) +
|
||||
stat_count(geom="line", position="identity") +
|
||||
stat_count(geom="point", position="identity", size = 1) +
|
||||
facet_wrap(~Sample, nrow = 3) +
|
||||
#ylim(0, 1500) +
|
||||
xlab("Цикл прочтения, №")+
|
||||
ylab("Количество вставок")+
|
||||
ggtitle("Число вставок на цикл") +
|
||||
scale_x_continuous(breaks=seq(0, 150, 25))+
|
||||
theme_bw()+
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
ggsave("../report/images_new_genome/ipc.png", width = 2250, height = 1500, units = "px")
|
||||
rm (ins2)
|
||||
|
||||
#Number of deletions per sample
|
||||
|
||||
parse_delets2 <- data.frame(matrix(nrow = 0, ncol = 2))
|
||||
colnames(parse_delets2)<-c("Num", "Sample")
|
||||
for (i in samples_ecoli_chr) {
|
||||
temp <- tp2_genome[tp2_genome$Sample==i,][,10]
|
||||
temp <- unlist(strsplit(temp,split="|",fixed = TRUE))
|
||||
temp<-cbind("Num"=temp, "Sample" = i)
|
||||
temp<-data.frame(temp)
|
||||
parse_delets2<-rbind(temp, parse_delets2)
|
||||
}
|
||||
ggplot(parse_delets2, aes(x=Sample, group = 1)) +
|
||||
stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
|
||||
xlab("Образец")+
|
||||
ylab("Количество делеций")+
|
||||
ggtitle("Число делеций на образец") +
|
||||
#geom_vline(xintercept = 50, linetype="dotted") +
|
||||
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
|
||||
ggsave("../report/images_new_genome/dps.png", width = 2250, height = 1500, units = "px")
|
||||
rm (parse_delets2)
|
||||
|
||||
#Number of insertions per sample
|
||||
|
||||
parse_inserts2 <- data.frame(matrix(nrow = 0, ncol = 2))
|
||||
colnames(parse_inserts2)<-c("Num", "Sample")
|
||||
for (i in samples_ecoli_chr) {
|
||||
temp <- tp2_genome[tp2_genome$Sample==i,][,11]
|
||||
temp <- unlist(strsplit(temp,split="|",fixed = TRUE))
|
||||
temp<-cbind("Num"=temp, "Sample" = i)
|
||||
temp<-data.frame(temp)
|
||||
parse_inserts2<-rbind(temp, parse_inserts2)
|
||||
}
|
||||
ggplot(parse_inserts2, aes(x=Sample, group = 1)) +
|
||||
stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
|
||||
xlab("Образец")+
|
||||
ylab("Количество вставок")+
|
||||
ggtitle("Число вставок на образец") +
|
||||
#geom_vline(xintercept = 50, linetype="dotted") +
|
||||
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
|
||||
ggsave("../report/images_new_genome/ips.png", width = 2250, height = 1500, units = "px")
|
||||
rm (parse_inserts2, temp)
|
||||
|
||||
#Frequency of insertion motifs per sample
|
||||
|
||||
insert_motifs2 <- data.frame(matrix(nrow = 0, ncol = 2))
|
||||
colnames(insert_motifs2)<-c("Num", "Sample")
|
||||
for (i in samples_ecoli_chr) {
|
||||
temp <- tp2_genome[tp2_genome$Sample==i,][,11]
|
||||
temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
|
||||
temp <- unlist(strsplit(temp,","))
|
||||
temp<- temp[!numbers_only(temp)]
|
||||
temp<-cbind("Num"=temp, "Sample" = i)
|
||||
temp<-data.frame(temp)
|
||||
insert_motifs2<-rbind(temp, insert_motifs2)
|
||||
}
|
||||
ggplot(insert_motifs2, aes(x = Num)) +
|
||||
stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) +
|
||||
facet_wrap(~Sample, nrow=4, scales = "free_y") +
|
||||
xlab("Мотив")+
|
||||
ylab("Количество")+
|
||||
ggtitle("Представленность мотивов вставок в образцах") +
|
||||
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
|
||||
ggsave("../report/images_new_genome/imps.png", width = 10550, height = 2500, units = "px")
|
||||
rm (insert_motifs2, temp)
|
||||
|
||||
#Frequency of deletion motifs per sample
|
||||
|
||||
delete_motifs2 <- data.frame(matrix(nrow = 0, ncol = 2))
|
||||
colnames(delete_motifs2)<-c("Num", "Sample")
|
||||
for (i in samples_ecoli_chr) {
|
||||
temp <- tp2_genome[tp2_genome$Sample==i,][,10]
|
||||
temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
|
||||
temp <- unlist(strsplit(temp,","))
|
||||
temp<- temp[!numbers_only(temp)]
|
||||
temp<-cbind("Num"=temp, "Sample" = i)
|
||||
temp<-data.frame(temp)
|
||||
delete_motifs2<-rbind(temp, delete_motifs2)
|
||||
}
|
||||
ggplot(delete_motifs2, aes(x = Num)) +
|
||||
stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) +
|
||||
facet_wrap(~Sample, nrow=4, scales = "free_y") +
|
||||
xlab("Мотив")+
|
||||
ylab("Количество")+
|
||||
ggtitle("Представленность мотивов делеций в образцах") +
|
||||
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
|
||||
ggsave("../report/images_new_genome/dmps.png", width = 10550, height = 2500, units = "px")
|
||||
rm (delete_motifs2, temp)
|
46
custom_lib_fastp_zvl_without_assembly.sh
Normal file
46
custom_lib_fastp_zvl_without_assembly.sh
Normal file
@ -0,0 +1,46 @@
|
||||
#!/bin/bash
|
||||
source /storage/data1/marmi/miniconda/installation/etc/profile.d/conda.sh
|
||||
conda activate marmi_methylome
|
||||
|
||||
#перед каждым запуском заполнять эти пять переменных и path_to_sample
|
||||
input_file="libs_11_12_23.csv"
|
||||
cell="V350165701"
|
||||
path_to_run="/storage/data1/aug/V350165701_repair"
|
||||
goal_dir="custom_libs_E_Coli_Zvl"
|
||||
path_to_assembly="/storage/data1/marmi/dec_custom_libs"
|
||||
|
||||
path_to_python="/storage/data1/marmi/trial_scripts/split_assembly.py"
|
||||
path_to_python_chr="/storage/data1/marmi/trial_scripts/chromosome.py"
|
||||
#выкидываем заголовок и заменяем пробелы на нижние подчеркивания
|
||||
cp $input_file source_csv.csv
|
||||
sed -i 's/ /_/g' source_csv.csv
|
||||
|
||||
<source_csv.csv awk -v a_cell="${cell}" -F"," 'NR!=1{print $1 " " a_cell "_L0" $4 "_" $3 "_1.fq.gz" " " a_cell "_L0" $4 "_" $3 "_2.fq.gz" " " $2 " " $4}' >reads_fastp.txt
|
||||
rm source_csv.csv
|
||||
|
||||
cat reads_fastp.txt | while read i || [[ -n $i ]];
|
||||
do
|
||||
dir=`echo $i| awk '{split($0, array); print array[1]}'`
|
||||
if [ "$dir" == "$goal_dir" ]; then
|
||||
fn1=`echo $i| awk '{split($0, array); print array[2]}'`
|
||||
fn2=`echo $i| awk '{split($0, array); print array[3]}'`
|
||||
sample=`echo $i| awk '{split($0, array); print array[4]}'`
|
||||
lane=`echo $i| awk '{split($0, array); print array[5]}'`
|
||||
path_to_sample="/storage/data1/marmi/dec_custom_libs_repaired_${sample}"
|
||||
mkdir ${path_to_sample}
|
||||
mkdir ${path_to_sample}/qc_source_files
|
||||
cd ${path_to_sample}/qc_source_files
|
||||
read1="${path_to_run}/L0${lane}/${fn1}"
|
||||
read2="${path_to_run}/L0${lane}/${fn2}"
|
||||
fastp -e 30 -w 7 --in1 ${read1} --in2 ${read2} \
|
||||
--out1 custom_${sample}_1.fastq.gz --out2 custom_${sample}_2.fastq.gz \
|
||||
--unpaired1 custom_${sample}_u.fastq.gz --unpaired2 custom_${sample}_u.fastq.gz \
|
||||
1>fastp_custom_${sample}_output.txt 2>fastp_custom_${sample}_err.txt
|
||||
fi
|
||||
done
|
||||
#НЕ УДАЛЯТЬ READS_FASTP.TXT
|
||||
|
||||
conda deactivate
|
||||
conda activate marmi_telegram
|
||||
telegram-send "Work_custom_union_assembly first step ends at `date`."
|
||||
conda deactivate
|
61
custom_lib_union_align_zvl.sh
Normal file
61
custom_lib_union_align_zvl.sh
Normal file
@ -0,0 +1,61 @@
|
||||
#!/bin/bash
|
||||
source /storage/data1/marmi/miniconda/installation/etc/profile.d/conda.sh
|
||||
conda activate marmi_methylome
|
||||
#перед каждым запуском заполнять эти три переменных и path_to_sample
|
||||
path_to_assembly="/storage/data1/marmi/dec_custom_libs_zvl/unicycler_assembly_custom"
|
||||
goal_dir="custom_libs_E_Coli_Zvl"
|
||||
path_to_tables="/storage/data1/marmi/dec_custom_libs_zvl/info_tables_repaired"
|
||||
mkdir /storage/data1/marmi/dec_custom_libs_zvl
|
||||
path_to_python="/storage/data1/marmi/trial_scripts/info_pol_mapq_union.py"
|
||||
mkdir ${path_to_tables}
|
||||
touch ${path_to_tables}/names_of_custom_chromosome.txt
|
||||
touch ${path_to_tables}/names_of_custom_plasmid.txt
|
||||
cat reads_fastp.txt | while read i || [[ -n $i ]];
|
||||
do
|
||||
dir=`echo $i| awk '{split($0, array); print array[1]}'`
|
||||
if [ "$dir" == "$goal_dir" ]; then
|
||||
sample=`echo $i| awk '{split($0, array); print array[4]}'`
|
||||
path_to_sample="/storage/data1/marmi/dec_custom_libs_repaired_${sample}"
|
||||
name_of_dir="union_${sample}"
|
||||
mkdir ${path_to_sample}/${name_of_dir}
|
||||
cd ${path_to_sample}/${name_of_dir}
|
||||
bwa mem -t 7 ${path_to_assembly}/assembly.fasta ../qc_source_files/custom_${sample}_1.fastq.gz \
|
||||
../qc_source_files/custom_${sample}_2.fastq.gz >custom_${sample}_paired.sam \
|
||||
2>bwa_mem_custom_${sample}_paired.log
|
||||
bwa mem -t 7 ${path_to_assembly}/assembly.fasta ../qc_source_files/custom_${sample}_u.fastq.gz \
|
||||
>custom_${sample}_unpaired.sam \
|
||||
2>bwa_mem_custom_${sample}_unpaired.log
|
||||
samtools view -Sb -o custom_${sample}_paired.bam custom_${sample}_paired.sam
|
||||
samtools view -Sb -o custom_${sample}_unpaired.bam custom_${sample}_unpaired.sam
|
||||
rm custom_${sample}_paired.sam
|
||||
rm custom_${sample}_unpaired.sam
|
||||
samtools sort -o custom_${sample}_paired_sorted.bam custom_${sample}_paired.bam
|
||||
samtools sort -o custom_${sample}_unpaired_sorted.bam custom_${sample}_unpaired.bam
|
||||
samtools merge -o custom_${sample}.bam \
|
||||
custom_${sample}_paired_sorted.bam custom_${sample}_unpaired_sorted.bam
|
||||
samtools index custom_${sample}.bam
|
||||
rm custom_${sample}_paired_sorted.bam
|
||||
rm custom_${sample}_unpaired_sorted.bam
|
||||
rm custom_${sample}_paired.bam
|
||||
rm custom_${sample}_unpaired.bam
|
||||
touch custom_${sample}_genome.txt
|
||||
touch custom_${sample}_plasmid.txt
|
||||
cat ${path_to_assembly}/chromosome_genome.txt | while read j || [[ -n $j ]];
|
||||
do
|
||||
samtools view -F 4 custom_${sample}.bam $j | grep 'MD:Z' >> custom_${sample}_genome.txt
|
||||
done
|
||||
cat ${path_to_assembly}/chromosome_plasmid.txt | while read j || [[ -n $j ]];
|
||||
do
|
||||
samtools view -F 4 custom_${sample}.bam $j | grep 'MD:Z' >> custom_${sample}_plasmid.txt
|
||||
done
|
||||
python3 ${path_to_python} ${sample}_genome ${path_to_sample}/${name_of_dir}/ ${path_to_tables}/
|
||||
python3 ${path_to_python} ${sample}_plasmid ${path_to_sample}/${name_of_dir}/ ${path_to_tables}/
|
||||
echo "custom_${sample}_genome_mapq20_table.txt" >>${path_to_tables}/names_of_custom_chromosome.txt
|
||||
echo "custom_${sample}_plasmid_mapq20_table.txt" >>${path_to_tables}/names_of_custom_plasmid.txt
|
||||
|
||||
fi
|
||||
done
|
||||
conda deactivate
|
||||
conda activate marmi_telegram
|
||||
telegram-send "Work_custom_union_align second step ends at `date`."
|
||||
conda deactivate
|
170
info_pol_mapq_union.py
Normal file
170
info_pol_mapq_union.py
Normal file
@ -0,0 +1,170 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import sys
|
||||
import re
|
||||
|
||||
def GC(buf):
|
||||
gc = buf.upper().count('G') + buf.upper().count('C')
|
||||
all = buf.upper().count('A') + buf.upper().count('T') + gc
|
||||
return (gc / all * 100)
|
||||
|
||||
def len_align(buf):
|
||||
"""
|
||||
нужно передавать CIGAR
|
||||
"""
|
||||
let = re.split('[0-9]+', buf)
|
||||
let.remove("")
|
||||
num = re.split('[MIDSH]{1}', buf)
|
||||
num.remove("")
|
||||
num = list(map(int, num))
|
||||
length = 0
|
||||
length_read = 0
|
||||
for i in range(len(let)):
|
||||
if (let[i]=="M") or (let[i]=="I"):
|
||||
length+=num[i]
|
||||
length_read+=num[i]
|
||||
elif ((let[i]=="S") or (let[i]=="H")):
|
||||
length_read+=num[i]
|
||||
return (length, length_read)
|
||||
|
||||
def info (buf, arg = "I"):
|
||||
"""
|
||||
Передавать CIGAR
|
||||
"""
|
||||
let = re.split('[0-9]+', buf)
|
||||
let.remove("")
|
||||
num = re.split('[MIDSH]{1}', buf)
|
||||
num.remove("")
|
||||
num = list(map(int, num))
|
||||
count = 0
|
||||
for i in range(len(let)):
|
||||
if (let[i]==arg):
|
||||
count+=num[i]
|
||||
return count
|
||||
|
||||
def info_insert (buf):
|
||||
"""
|
||||
Передавать CIGAR, возращает позиции инсерции, включая левый символ и не включая правый символ
|
||||
"""
|
||||
let = re.split('[0-9]+', buf)
|
||||
let.remove("")
|
||||
num = re.split('[MIDSH]{1}', buf)
|
||||
num.remove("")
|
||||
num = list(map(int, num))
|
||||
start = 0
|
||||
intervals = []
|
||||
for i in range(len(let)):
|
||||
if (let[i]=="I"):
|
||||
intervals.append([start, start+num[i]])
|
||||
if (let[i]=="D") or (let[i]=="H"):
|
||||
continue
|
||||
start+=num[i]
|
||||
return tuple(intervals)
|
||||
|
||||
def removal (buf):
|
||||
"""
|
||||
Передавать CIGAR, возращает позиции удаления, включая левый символ и не включая правый символ
|
||||
"""
|
||||
let = re.split('[0-9]+', buf)
|
||||
let.remove("")
|
||||
num = re.split('[MIDSH]{1}', buf)
|
||||
num.remove("")
|
||||
num = list(map(int, num))
|
||||
start = 0
|
||||
intervals = []
|
||||
for i in range(len(let)):
|
||||
if (let[i]=="I") or (let[i]=="S"):
|
||||
intervals.append([start, start+num[i]])
|
||||
if (let[i]=="D") or (let[i]=="H"):
|
||||
continue
|
||||
start+=num[i]
|
||||
return tuple(intervals)
|
||||
|
||||
def type_snp_text (seq, buf, cigar):
|
||||
positions_ins = info_insert(cigar)
|
||||
inserts_text = ''
|
||||
inserts = []
|
||||
for i in positions_ins:
|
||||
inserts.append((seq[i[0]:i[1]], i[0]))
|
||||
inserts_text = inserts_text + str(seq[i[0]:i[1]]) + "," + str(i[0]) + '|'
|
||||
inserts_text = inserts_text[:-1]
|
||||
gc_per = GC(seq)
|
||||
positions = removal(cigar)
|
||||
for i in positions:
|
||||
if (i[1]<(len(seq))):
|
||||
seq = seq[:i[0]] + "_"*(i[1]-i[0]) + seq[i[1]:]
|
||||
else:
|
||||
seq = seq[:i[0]] + "_"*(i[1]-i[0])
|
||||
seq = seq.replace("_", "")
|
||||
buf = buf.replace("MD:Z:", "")
|
||||
types = {"A>C":0, "A>G":0, "A>T":0, "C>A":0, "C>G":0, "C>T":0, "G>A":0, "G>C":0, "G>T":0, "T>A":0, "T>C":0, "T>G":0}
|
||||
cycles_text = ''
|
||||
dels_text = ''
|
||||
cycles = []
|
||||
dels = []
|
||||
let = re.split('[0-9]+', buf)
|
||||
let.pop(0)
|
||||
num = re.split('[ATGC^]+', buf)
|
||||
num = list(map(int, num))
|
||||
iter = 0
|
||||
for i in range(len(num)):
|
||||
iter+=num[i]
|
||||
if (let[i] == '') or (let[i] == '\n'):
|
||||
continue
|
||||
if (let[i][0] == '^'):
|
||||
dels_text = dels_text + str(let[i][1:]) + "," + str(iter) + "|"
|
||||
dels.append((let[i][1:], iter))
|
||||
continue
|
||||
if (seq[iter]=="A") or (seq[iter]=="C") or (seq[iter]=="G") or (seq[iter]=="T"):
|
||||
key = let[i] + ">" + seq[iter]
|
||||
types[key]+=1
|
||||
cycles.append(iter)
|
||||
cycles_text = cycles_text + str(iter) + ","
|
||||
iter+=1
|
||||
else:
|
||||
iter+=1
|
||||
l = len_align(cigar)
|
||||
dels_text = dels_text[:-1]
|
||||
cycles_text = cycles_text[:-1]
|
||||
if len(dels)!=0:
|
||||
len_del = sum([len(dels[i][0]) for i in range(len(dels))])/len(dels)
|
||||
else:
|
||||
len_del = 0
|
||||
if len(inserts)!=0:
|
||||
len_ins = sum([len(inserts[i][0]) for i in range(len(inserts))])/len(inserts)
|
||||
else:
|
||||
len_ins = 0
|
||||
result = {"SNP": types, "Cycles": cycles_text, "Dels": dels_text, "Inserts" : inserts_text, "GC": gc_per, "Count_SNPs": sum(types.values()), "Mean_length_del": len_del, \
|
||||
"Mean_length_insert": len_ins, "Read_length": l[1], "Align_length": l[0]}
|
||||
return result
|
||||
|
||||
|
||||
|
||||
with open(sys.argv[3]+"custom_"+sys.argv[1]+"_mapq20_table.txt", 'w') as table:
|
||||
header = "ID;Read;Count_SNPs;Cycles;GC;Mean_length_del;Mean_length_insert;Read_length;Align_length;Dels;Inserts;A>C;A>G;A>T;C>A;C>G;C>T;G>A;G>C;G>T;T>A;T>C;T>G\n"
|
||||
table.write(header)
|
||||
with open(sys.argv[2]+"custom_"+sys.argv[1]+".txt", 'r') as inp:
|
||||
for line in inp:
|
||||
columns = line.split("\t")
|
||||
if int(columns[4])>20:
|
||||
id = columns[0]
|
||||
flag = columns[1]
|
||||
binary = "{0:b}".format(int(flag))
|
||||
if (binary[-1]=='0'):
|
||||
typ = 0
|
||||
elif len(binary)==7:
|
||||
typ = 1
|
||||
elif len(binary)>=8:
|
||||
if (binary[-7]=='0') and (binary[-8]=='1'):
|
||||
typ = 2
|
||||
else:
|
||||
typ = -1
|
||||
else:
|
||||
typ = -1
|
||||
cigar = columns[5]
|
||||
md = columns[12]
|
||||
seq = columns[9]
|
||||
inf = type_snp_text(seq, md, cigar)
|
||||
stroke = id + ";" + str(typ) + ";" + str(inf['Count_SNPs']) + ";" + inf['Cycles'] + ";" + str(inf['GC']) + ";" + str(inf['Mean_length_del']) + ";" + str(inf['Mean_length_insert']) + ";" + str(inf['Read_length']) + ";" + str(inf['Align_length']) + ";" + inf['Dels'] + ";" + inf['Inserts'] + ";" + \
|
||||
str(inf['SNP']['A>C']) + ";" + str(inf['SNP']['A>G']) + ";" + str(inf['SNP']['A>T']) + ";" + str(inf['SNP']['C>A']) + ";" + str(inf['SNP']['C>G']) + ";" + str(inf['SNP']['C>T']) + ";" + str(inf['SNP']['G>A']) + ";" + str(inf['SNP']['G>C']) + ";" + str(inf['SNP']['G>T']) + ";" + str(inf['SNP']['T>A']) + ";" + str(inf['SNP']['T>C']) + ";" + str(inf['SNP']['T>G']) + '\n'
|
||||
table.write(stroke)
|
201
plasmid.R
201
plasmid.R
@ -8,29 +8,31 @@ library(stringr)
|
||||
#Uploading data in R
|
||||
|
||||
temp_name<-read.table("names_of_custom_plasmid.txt")
|
||||
samples <- c()
|
||||
samples_ecoli <- c()
|
||||
for (i in temp_name[,c("V1")]) {
|
||||
samples<-c(samples, gsub("_plasmid_mapq20_table.txt", "", gsub("custom_", "", i)))
|
||||
samples_ecoli<-c(samples_ecoli, gsub("_plasmid_mapq20_table.txt", "", gsub("custom_", "", i)))
|
||||
}
|
||||
tp2 <- data.frame(matrix(ncol = 24, nrow = 0))
|
||||
colnames(tp2) <- unlist(strsplit("ID;Read;Count_SNPs;Cycles;GC;Mean_length_del;Mean_length_insert;Read_length;Align_length;Dels;Inserts;A>C;A>G;A>T;C>A;C>G;C>T;G>A;G>C;G>T;T>A;T>C;T>G",";"))
|
||||
tp2_plasmid <- data.frame(matrix(ncol = 24, nrow = 0))
|
||||
colnames(tp2_plasmid) <- unlist(strsplit("ID;Read;Count_SNPs;Cycles;GC;Mean_length_del;Mean_length_insert;Read_length;Align_length;Dels;Inserts;A>C;A>G;A>T;C>A;C>G;C>T;G>A;G>C;G>T;T>A;T>C;T>G",";"))
|
||||
for (i in temp_name[,c("V1")]) {
|
||||
temp<-read.csv(i, sep=";", header=TRUE)
|
||||
temp<-subset(temp, Read==2)
|
||||
temp<-temp[sample(nrow(temp), 200000),]
|
||||
temp<-temp[sample(nrow(temp), 100000),]
|
||||
temp<-cbind(temp, "Sample"=gsub("_plasmid_mapq20_table.txt", "", gsub("custom_", "", i)))
|
||||
tp2<-rbind(tp2, temp)
|
||||
tp2_plasmid<-rbind(tp2_plasmid, temp)
|
||||
}
|
||||
rm(temp, temp_name)
|
||||
tp2$Sample<-as.factor(tp2$Sample)
|
||||
tp2_plasmid$Sample<-as.factor(tp2_plasmid$Sample)
|
||||
dir.create("../report/images_new_plasmid")
|
||||
|
||||
#GC-content
|
||||
|
||||
ggplot(data = tp2, aes(x = GC)) +
|
||||
ggplot(data = tp2_plasmid, aes(x = GC)) +
|
||||
geom_density(stat = "density", fill = "pink", alpha = 0.5) +
|
||||
facet_wrap(~Sample, nrow = 2) +
|
||||
facet_wrap(~Sample, nrow = 3) +
|
||||
geom_vline(xintercept = 50, linetype="dotted") +
|
||||
xlab("ГЦ-состав, %") +
|
||||
ylab("Доля прочтений") +
|
||||
xlim(25, 75) +
|
||||
ylim(0, 0.15) +
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
@ -40,8 +42,8 @@ ggsave("../report/images_new_plasmid/gc.png", width = 2250, height = 1500, units
|
||||
|
||||
df2<-data.frame(matrix(nrow = 0, ncol = 2))
|
||||
colnames(df2)<-c("Cycles", "Sample")
|
||||
for (i in samples) {
|
||||
cyc <-subset(tp2, Sample == i)[,4]
|
||||
for (i in samples_ecoli) {
|
||||
cyc <-subset(tp2_plasmid, Sample == i)[,4]
|
||||
cyc <- as.numeric(unlist(strsplit(cyc,",")))
|
||||
df2<-rbind(df2, cbind("Cycles" = cyc, "Sample" =i))
|
||||
}
|
||||
@ -51,33 +53,41 @@ rm (cyc, i)
|
||||
ggplot(df2, aes(x = Cycles)) +
|
||||
geom_density(stat = "count", fill = "blue", alpha = 0.5) +
|
||||
#geom_line(stat = "count", fill = "blue", alpha = 0.5) +
|
||||
facet_wrap(~Sample, nrow = 2) +
|
||||
ggtitle("Mismatches per cycle") +
|
||||
facet_wrap(~Sample, nrow = 3) +
|
||||
xlab("Цикл прочтения, №")+
|
||||
ylab("Количество замен")+
|
||||
ggtitle("Количество однонуклеотидных замен по циклам") +
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
ggsave("../report/images_new_plasmid/mpc.png", width = 2250, height = 1500, units = "px")
|
||||
ggplot(df2, aes(x = Cycles)) +
|
||||
geom_density(stat = "density", fill = "blue", alpha = 0.5) +
|
||||
facet_wrap(~Sample, nrow = 2) +
|
||||
ggtitle("Mismatches per cycle") +
|
||||
facet_wrap(~Sample, nrow = 3) +
|
||||
xlab("Цикл прочтения, №")+
|
||||
ylab("Доля от общего количества")+
|
||||
ggtitle("Количество однонуклеотидных замен по циклам") +
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
ggsave("../report/images_new_plasmid/mpc_density.png", width = 2250, height = 1500, units = "px")
|
||||
rm (df2)
|
||||
|
||||
#Mismatches depending on GC-content
|
||||
|
||||
mgc2<-cbind(tp2[, c("Count_SNPs", "Sample")], "GC" = round(tp2$GC))
|
||||
mgc2<-cbind(tp2_plasmid[, c("Count_SNPs", "Sample")], "GC" = round(tp2_plasmid$GC))
|
||||
ggplot(mgc2, aes(x=GC, y=Count_SNPs)) +
|
||||
stat_summary(fun = sum, geom="line", linewidth = 0.5) +
|
||||
facet_wrap(~Sample, nrow = 2) +
|
||||
ggtitle("Mismatches/GC") +
|
||||
facet_wrap(~Sample, nrow = 3) +
|
||||
xlab("ГЦ-состав, %")+
|
||||
ylab("Количество однонуклеотидных замен")+
|
||||
ggtitle("Количество однонуклеотидных замен в зависимости от ГЦ-состава прочтений") +
|
||||
geom_vline(xintercept = 50, linetype="dotted") +
|
||||
theme_bw()+
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
ggsave("../report/images_new_plasmid/mgc.png", width = 2250, height = 1500, units = "px")
|
||||
ggplot(mgc2, aes(x=GC, y=Count_SNPs)) +
|
||||
stat_summary(fun = mean, geom="line", size = 0.5) +
|
||||
facet_wrap(~Sample, nrow = 2) +
|
||||
ggtitle("Mismatches/GC") +
|
||||
facet_wrap(~Sample, nrow = 3) +
|
||||
xlab("ГЦ-состав, %")+
|
||||
ylab("Среднее число однонуклеотидных замен на одно прочтение")+
|
||||
ggtitle("Количество однонуклеотидных замен в зависимости от ГЦ-состава прочтения") +
|
||||
geom_vline(xintercept = 50, linetype="dotted") +
|
||||
theme_bw()+
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
@ -88,37 +98,41 @@ rm (mgc2)
|
||||
|
||||
mtype2 <- data.frame(matrix(ncol = 3, nrow = 0))
|
||||
colnames(mtype2)<- c("Sum", "type", "Sample")
|
||||
for (i in samples) {
|
||||
temp <- apply(tp2[, c(12:23)][tp2$Sample==i,], 2, sum)
|
||||
for (i in samples_ecoli) {
|
||||
temp <- apply(tp2_plasmid[, c(12:23)][tp2_plasmid$Sample==i,], 2, sum)
|
||||
temp <- data.frame("Sum" = temp)
|
||||
temp$type<-as.factor(colnames(tp2[, c(12:23)]))
|
||||
temp$type<-as.factor(colnames(tp2_plasmid[, c(12:23)]))
|
||||
temp<-cbind(temp, "Sample" = i)
|
||||
mtype2<-rbind(mtype2, temp)
|
||||
}
|
||||
ggplot(mtype2) +
|
||||
geom_line(aes(x = type, y = Sum, group = Sample)) +
|
||||
geom_point(aes(x = type, y = Sum)) +
|
||||
#facet_wrap(~Sample, nrow = 2, scales = "free_y") +
|
||||
facet_wrap(~Sample, nrow = 2, ) +
|
||||
ggtitle("Mismatch type") +
|
||||
#facet_wrap(~Sample, nrow = 3, scales = "free_y") +
|
||||
facet_wrap(~Sample, nrow = 3, ) +
|
||||
xlab("Тип однонуклеотидной замены")+
|
||||
ylab("Количество однонуклеотидных замен")+
|
||||
ggtitle("Частота типов однонуклеотидных замен") +
|
||||
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
|
||||
ggsave("../report/images_new_plasmid/mt.png", width = 2250, height = 1500, units = "px")
|
||||
|
||||
mtype2_2 <- data.frame(matrix(ncol = 3, nrow = 0))
|
||||
colnames(mtype2_2)<- c("mean", "type", "Sample")
|
||||
for (i in samples) {
|
||||
temp <- apply(tp2[tp2$Sample==i, c(12:23)], 2, mean)
|
||||
for (i in samples_ecoli) {
|
||||
temp <- apply(tp2_plasmid[tp2_plasmid$Sample==i, c(12:23)], 2, mean)
|
||||
temp <- data.frame("mean" = temp)
|
||||
temp$type<-as.factor(colnames(tp2[, c(12:23)]))
|
||||
temp$type<-as.factor(colnames(tp2_plasmid[, c(12:23)]))
|
||||
temp<-cbind(temp, "Sample" = i)
|
||||
mtype2_2<-rbind(mtype2_2, temp)
|
||||
}
|
||||
ggplot(mtype2_2) +
|
||||
geom_line(aes(x = type, y = mean, group = Sample)) +
|
||||
geom_point(aes(x = type, y = mean)) +
|
||||
#facet_wrap(~Sample, nrow = 2, scales = "free_y") +
|
||||
facet_wrap(~Sample, nrow = 2) +
|
||||
ggtitle("Mismatch type") +
|
||||
#facet_wrap(~Sample, nrow = 3, scales = "free_y") +
|
||||
facet_wrap(~Sample, nrow = 3) +
|
||||
xlab("Тип однонуклеотидной замены")+
|
||||
ylab("Среднее по прочтению")+
|
||||
ggtitle("Частота типов однонуклеотидных замен") +
|
||||
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
|
||||
ggsave("../report/images_new_plasmid/mt_mean.png", width = 2250, height = 1500, units = "px")
|
||||
rm(mtype2, mtype2_2)
|
||||
@ -127,8 +141,8 @@ rm(mtype2, mtype2_2)
|
||||
|
||||
delets2 <- data.frame(matrix(ncol = 2, nrow = 0))
|
||||
colnames(delets2)<- c("Nucleotide", "Sample")
|
||||
for (i in samples) {
|
||||
temp <- tp2[tp2$Sample==i,][,10]
|
||||
for (i in samples_ecoli) {
|
||||
temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,10]
|
||||
temp <- unlist(strsplit(temp,""))
|
||||
temp<- temp[temp %in% c(letters, LETTERS)]
|
||||
temp<-cbind("Nucleotide"=temp, "Sample" = i)
|
||||
@ -138,9 +152,11 @@ for (i in samples) {
|
||||
}
|
||||
ggplot(delets2) +
|
||||
geom_bar(aes(x = Nucleotide), width = 0.4, fill = "blue", alpha = 0.5, col = "black") +
|
||||
facet_wrap(~Sample, nrow = 2) +
|
||||
facet_wrap(~Sample, nrow = 3) +
|
||||
#ylim(0, 1500) +
|
||||
ggtitle ("Deletions") +
|
||||
xlab("Нуклеотид")+
|
||||
ylab("Количество делеций")+
|
||||
ggtitle("Состав делеций") +
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
ggsave("../report/images_new_plasmid/delets.png", width = 2250, height = 1500, units = "px")
|
||||
rm (delets2)
|
||||
@ -149,8 +165,8 @@ rm (delets2)
|
||||
|
||||
inserts2 <- data.frame(matrix(ncol = 2, nrow = 0))
|
||||
colnames(inserts2)<- c("Nucleotide", "Sample")
|
||||
for (i in samples) {
|
||||
temp <- tp2[tp2$Sample==i,][,11]
|
||||
for (i in samples_ecoli) {
|
||||
temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,11]
|
||||
temp <- unlist(strsplit(temp,""))
|
||||
temp<- temp[temp %in% c(letters, LETTERS)]
|
||||
temp<-cbind("Nucleotide"=temp, "Sample" = i)
|
||||
@ -159,9 +175,11 @@ for (i in samples) {
|
||||
inserts2<-rbind(temp, inserts2)
|
||||
}
|
||||
ggplot(inserts2) +
|
||||
ggtitle("Insertions") +
|
||||
xlab("Нуклеотид")+
|
||||
ylab("Количество вставок")+
|
||||
ggtitle("Состав вставок") +
|
||||
geom_bar(aes(x = Nucleotide), width = 0.4, fill = "blue", alpha = 0.5, col = "black") +
|
||||
facet_wrap(~Sample, nrow = 2) +
|
||||
facet_wrap(~Sample, nrow = 3) +
|
||||
#ylim(0, 2000) +
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
ggsave("../report/images_new_plasmid/inserts.png", width = 2250, height = 1500, units = "px")
|
||||
@ -171,8 +189,8 @@ rm (inserts2)
|
||||
|
||||
insert_mean2 <- data.frame(matrix(ncol = 7, nrow = 0))
|
||||
colnames(insert_mean2)<- c("Sample", "Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.")
|
||||
for (i in samples) {
|
||||
temp<-subset(tp2, Mean_length_del!=0 & Sample == i)[,7]
|
||||
for (i in samples_ecoli) {
|
||||
temp<-subset(tp2_plasmid, Mean_length_del!=0 & Sample == i)[,7]
|
||||
temp<-cbind("Sample"=i, t(summary(temp)))
|
||||
insert_mean2<-rbind(insert_mean2, temp)
|
||||
}
|
||||
@ -182,12 +200,12 @@ insert_mean2[, 5]<-as.numeric(insert_mean2[, 5])
|
||||
insert_mean2[, 7]<-as.numeric(insert_mean2[, 7])
|
||||
ggplot(data.frame(insert_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) +
|
||||
geom_pointrange(aes(ymin = 0, ymax = Max.)) +
|
||||
scale_y_continuous(name = "Length") +
|
||||
geom_point(aes(x = factor(Sample), y = 0), shape = 1) +
|
||||
geom_point(aes(x = factor(Sample), y = Max.), shape = 1) +
|
||||
ggtitle("Insertion mean length") +
|
||||
scale_x_discrete(name = "Sample") +
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
xlab("Образец")+
|
||||
ylab("Количество нуклеотидов")+
|
||||
ggtitle("Статистика длины вставки") +
|
||||
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
|
||||
ggsave("../report/images_new_plasmid/iml.png", width = 2250, height = 1500, units = "px")
|
||||
rm (insert_mean2)
|
||||
|
||||
@ -195,8 +213,8 @@ rm (insert_mean2)
|
||||
|
||||
delete_mean2 <- data.frame(matrix(ncol = 7, nrow = 0))
|
||||
colnames(delete_mean2)<- c("Sample", "Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.")
|
||||
for (i in samples) {
|
||||
temp<-subset(tp2, Mean_length_del!=0 & Sample == i)[,6]
|
||||
for (i in samples_ecoli) {
|
||||
temp<-subset(tp2_plasmid, Mean_length_del!=0 & Sample == i)[,6]
|
||||
temp<-cbind("Sample"=i, t(summary(temp)))
|
||||
delete_mean2<-rbind(delete_mean2, temp)
|
||||
}
|
||||
@ -208,29 +226,32 @@ ggplot(data.frame(delete_mean2), aes(x = factor(Sample), y = Mean), size = 0.5)
|
||||
geom_pointrange(aes(ymin = 0, ymax = Max.)) +
|
||||
geom_point(aes(x = factor(Sample), y = 0), shape = 1) +
|
||||
geom_point(aes(x = factor(Sample), y = Max.), shape = 1) +
|
||||
ggtitle("Deletion mean length") +
|
||||
scale_x_discrete(name = "Sample") +
|
||||
scale_y_continuous(name = "Length") +
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
xlab("Образец")+
|
||||
ylab("Количество нуклеотидов")+
|
||||
ggtitle("Статистика длины делеции") +
|
||||
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
|
||||
ggsave("../report/images_new_plasmid/dml.png", width = 2250, height = 1500, units = "px")
|
||||
rm(delete_mean2)
|
||||
|
||||
#Number of mismatches per sample
|
||||
|
||||
ggplot(tp2, aes(x=Sample, y=Count_SNPs, group = 1)) +
|
||||
ggplot(tp2_plasmid, aes(x=Sample, y=Count_SNPs, group = 1)) +
|
||||
stat_summary(fun = sum, geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
|
||||
ggtitle("Mismatches per sample") +
|
||||
xlab("Образец")+
|
||||
ylab("Количество замен")+
|
||||
ggtitle("Количество однонуклеотидных замен") +
|
||||
#geom_vline(xintercept = 50, linetype="dotted") +
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
|
||||
ggsave("../report/images_new_plasmid/mps_mean.png", width = 2250, height = 1500, units = "px")
|
||||
|
||||
|
||||
ggplot(tp2, aes(x=Sample, y=Count_SNPs, group = 1)) +
|
||||
stat_summary(fun = mean, geom="point") +
|
||||
stat_summary(fun = mean, geom="line") +
|
||||
ggtitle("Mismatches per sample") +
|
||||
ggplot(tp2_plasmid, aes(x=Sample, y=Count_SNPs, group = 1)) +
|
||||
stat_summary(fun = mean, geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
|
||||
xlab("Образец")+
|
||||
ylab("Среднее число замен")+
|
||||
ggtitle("Количество однонуклеотидных замен на одно прочтение") +
|
||||
#geom_vline(xintercept = 50, linetype="dotted") +
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
|
||||
ggsave("../report/images_new_plasmid/mps.png", width = 2250, height = 1500, units = "px")
|
||||
|
||||
#Number of deletions per cycle of sequence
|
||||
@ -239,8 +260,8 @@ numbers_only <- function(x) !grepl("\\D", x)
|
||||
|
||||
dels2 <- data.frame(matrix(nrow = 0, ncol = 2))
|
||||
colnames(dels2)<-c("Num", "Sample")
|
||||
for (i in samples) {
|
||||
temp <- tp2[tp2$Sample==i,][,10]
|
||||
for (i in samples_ecoli) {
|
||||
temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,10]
|
||||
temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
|
||||
temp <- unlist(strsplit(temp,","))
|
||||
temp<- temp[numbers_only(temp)]
|
||||
@ -253,9 +274,11 @@ ggplot(dels2, aes(x = Num)) +
|
||||
#geom_density(stat = "count", alpha = 0.5) +
|
||||
stat_count(geom="line", position="identity") +
|
||||
stat_count(geom="point", position="identity", size = 1) +
|
||||
facet_wrap(~Sample, nrow = 2) +
|
||||
facet_wrap(~Sample, nrow = 3) +
|
||||
#ylim(0, 1500) +
|
||||
ggtitle ("Deletions per cycle") +
|
||||
xlab("Цикл прочтения, №")+
|
||||
ylab("Количество делеций")+
|
||||
ggtitle("Число делеций на цикл") +
|
||||
theme_bw()+
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
ggsave("../report/images_new_plasmid/dpc.png", width = 2250, height = 1500, units = "px")
|
||||
@ -265,8 +288,8 @@ rm(dels2)
|
||||
|
||||
ins2 <- data.frame(matrix(nrow = 0, ncol = 2))
|
||||
colnames(ins2)<-c("Num", "Sample")
|
||||
for (i in samples) {
|
||||
temp <- tp2[tp2$Sample==i,][,11]
|
||||
for (i in samples_ecoli) {
|
||||
temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,11]
|
||||
temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
|
||||
temp <- unlist(strsplit(temp,","))
|
||||
temp<- temp[numbers_only(temp)]
|
||||
@ -278,9 +301,11 @@ ins2$Num<-as.numeric(ins2$Num)
|
||||
ggplot(ins2, aes(x = Num)) +
|
||||
stat_count(geom="line", position="identity") +
|
||||
stat_count(geom="point", position="identity", size = 1) +
|
||||
facet_wrap(~Sample, nrow = 2, scales = "free_y") +
|
||||
facet_wrap(~Sample, nrow = 3) +
|
||||
#ylim(0, 1500) +
|
||||
ggtitle ("Insertions per cycle") +
|
||||
xlab("Цикл прочтения, №")+
|
||||
ylab("Количество вставок")+
|
||||
ggtitle("Число вставок на цикл") +
|
||||
scale_x_continuous(breaks=seq(0, 150, 25))+
|
||||
theme_bw()+
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
@ -291,8 +316,8 @@ rm (ins2)
|
||||
|
||||
parse_delets2 <- data.frame(matrix(nrow = 0, ncol = 2))
|
||||
colnames(parse_delets2)<-c("Num", "Sample")
|
||||
for (i in samples) {
|
||||
temp <- tp2[tp2$Sample==i,][,10]
|
||||
for (i in samples_ecoli) {
|
||||
temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,10]
|
||||
temp <- unlist(strsplit(temp,split="|",fixed = TRUE))
|
||||
temp<-cbind("Num"=temp, "Sample" = i)
|
||||
temp<-data.frame(temp)
|
||||
@ -300,9 +325,11 @@ for (i in samples) {
|
||||
}
|
||||
ggplot(parse_delets2, aes(x=Sample, group = 1)) +
|
||||
stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
|
||||
ggtitle("Deletions per sample") +
|
||||
xlab("Образец")+
|
||||
ylab("Количество делеций")+
|
||||
ggtitle("Число делеций на образец") +
|
||||
#geom_vline(xintercept = 50, linetype="dotted") +
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
|
||||
ggsave("../report/images_new_plasmid/dps.png", width = 2250, height = 1500, units = "px")
|
||||
rm (parse_delets2)
|
||||
|
||||
@ -310,8 +337,8 @@ rm (parse_delets2)
|
||||
|
||||
parse_inserts2 <- data.frame(matrix(nrow = 0, ncol = 2))
|
||||
colnames(parse_inserts2)<-c("Num", "Sample")
|
||||
for (i in samples) {
|
||||
temp <- tp2[tp2$Sample==i,][,11]
|
||||
for (i in samples_ecoli) {
|
||||
temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,11]
|
||||
temp <- unlist(strsplit(temp,split="|",fixed = TRUE))
|
||||
temp<-cbind("Num"=temp, "Sample" = i)
|
||||
temp<-data.frame(temp)
|
||||
@ -319,9 +346,11 @@ for (i in samples) {
|
||||
}
|
||||
ggplot(parse_inserts2, aes(x=Sample, group = 1)) +
|
||||
stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
|
||||
ggtitle("Insertions per sample") +
|
||||
xlab("Образец")+
|
||||
ylab("Количество вставок")+
|
||||
ggtitle("Число вставок на образец") +
|
||||
#geom_vline(xintercept = 50, linetype="dotted") +
|
||||
theme(plot.title = element_text(hjust = 0.5))
|
||||
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
|
||||
ggsave("../report/images_new_plasmid/ips.png", width = 2250, height = 1500, units = "px")
|
||||
rm (parse_inserts2, temp)
|
||||
|
||||
@ -329,8 +358,8 @@ rm (parse_inserts2, temp)
|
||||
|
||||
insert_motifs2 <- data.frame(matrix(nrow = 0, ncol = 2))
|
||||
colnames(insert_motifs2)<-c("Num", "Sample")
|
||||
for (i in samples) {
|
||||
temp <- tp2[tp2$Sample==i,][,11]
|
||||
for (i in samples_ecoli) {
|
||||
temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,11]
|
||||
temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
|
||||
temp <- unlist(strsplit(temp,","))
|
||||
temp<- temp[!numbers_only(temp)]
|
||||
@ -341,17 +370,19 @@ for (i in samples) {
|
||||
ggplot(insert_motifs2, aes(x = Num)) +
|
||||
stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) +
|
||||
facet_wrap(~Sample, nrow=4, scales = "free_y") +
|
||||
ggtitle("Insertion motifs per sample") +
|
||||
xlab("Мотив")+
|
||||
ylab("Количество")+
|
||||
ggtitle("Представленность мотивов вставок в образцах") +
|
||||
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
|
||||
ggsave("../report/images_new_plasmid/imps.png", width = 3250, height = 2500, units = "px")
|
||||
ggsave("../report/images_new_plasmid/imps.png", width = 7550, height = 2500, units = "px")
|
||||
rm (insert_motifs2, temp)
|
||||
|
||||
#Frequency of deletion motifs per sample
|
||||
|
||||
delete_motifs2 <- data.frame(matrix(nrow = 0, ncol = 2))
|
||||
colnames(delete_motifs2)<-c("Num", "Sample")
|
||||
for (i in samples) {
|
||||
temp <- tp2[tp2$Sample==i,][,10]
|
||||
for (i in samples_ecoli) {
|
||||
temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,10]
|
||||
temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
|
||||
temp <- unlist(strsplit(temp,","))
|
||||
temp<- temp[!numbers_only(temp)]
|
||||
@ -362,7 +393,9 @@ for (i in samples) {
|
||||
ggplot(delete_motifs2, aes(x = Num)) +
|
||||
stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) +
|
||||
facet_wrap(~Sample, nrow=4, scales = "free_y") +
|
||||
ggtitle("Deletions motifs per sample") +
|
||||
xlab("Мотив")+
|
||||
ylab("Количество")+
|
||||
ggtitle("Представленность мотивов делеций в образцах") +
|
||||
theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
|
||||
ggsave("../report/images_new_plasmid/dmps.png", width = 4550, height = 2500, units = "px")
|
||||
ggsave("../report/images_new_plasmid/dmps.png", width = 7550, height = 2500, units = "px")
|
||||
rm (delete_motifs2, temp)
|
BIN
qr-code.png
Normal file
BIN
qr-code.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 7.9 KiB |
Loading…
Reference in New Issue
Block a user