library(ggplot2) library(stringr) ##### #This part of scripts produces the plots of alignment data. Only genome-aligned reverse reads were taken here. #For every sample was created a set of reads of equal capacity (min between all sample reads). ##### #Uploading data in R temp_name<-read.table("names_of_custom_genome.txt") samples_ecoli_chr <- c() for (i in temp_name[,c("V1")]) { samples_ecoli_chr<-c(samples_ecoli_chr, gsub("_genome_mapq20_table.txt", "", gsub("custom_", "", i))) } tp2_genome <- data.frame(matrix(ncol = 24, nrow = 0)) colnames(tp2_genome) <- unlist(strsplit("ID;Read;Count_SNPs;Cycles;GC;Mean_length_del;Mean_length_insert;Read_length;Align_length;Dels;Inserts;A>C;A>G;A>T;C>A;C>G;C>T;G>A;G>C;G>T;T>A;T>C;T>G",";")) for (i in temp_name[,c("V1")]) { temp<-read.csv(i, sep=";", header=TRUE) temp<-subset(temp, Read==2) temp<-temp[sample(nrow(temp), 500000),] temp<-cbind(temp, "Sample"=gsub("_genome_mapq20_table.txt", "", gsub("custom_", "", i))) tp2_genome<-rbind(tp2_genome, temp) } rm(temp, temp_name) tp2_genome$Sample<-as.factor(tp2_genome$Sample) dir.create("../report/images_new_genome") #GC-content ggplot(data = tp2_genome, aes(x = GC)) + geom_density(stat = "density", fill = "pink", alpha = 0.5) + facet_wrap(~Sample, nrow = 3) + geom_vline(xintercept = 50, linetype="dotted") + xlab("ГЦ-состав, %") + ylab("Доля прочтений") + xlim(25, 75) + ylim(0, 0.15) + theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_new_genome/gc.png", width = 2250, height = 1500, units = "px") #Mismatches per cycle of sequence df2<-data.frame(matrix(nrow = 0, ncol = 2)) colnames(df2)<-c("Cycles", "Sample") for (i in samples_ecoli_chr) { cyc <-subset(tp2_genome, Sample == i)[,4] cyc <- as.numeric(unlist(strsplit(cyc,","))) df2<-rbind(df2, cbind("Cycles" = cyc, "Sample" =i)) } df2$Sample<-as.factor(df2$Sample) df2$Cycles<-as.numeric(df2$Cycles) rm (cyc, i) ggplot(df2, aes(x = Cycles)) + geom_density(stat = "count", fill = "blue", alpha = 0.5) + #geom_line(stat = "count", fill = "blue", alpha = 0.5) + facet_wrap(~Sample, nrow = 3) + xlab("Цикл прочтения, №")+ ylab("Количество замен")+ ggtitle("Количество однонуклеотидных замен по циклам") + theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_new_genome/mpc.png", width = 2250, height = 1500, units = "px") ggplot(df2, aes(x = Cycles)) + geom_density(stat = "density", fill = "blue", alpha = 0.5) + facet_wrap(~Sample, nrow = 3) + xlab("Цикл прочтения, №")+ ylab("Доля от общего количества")+ ggtitle("Количество однонуклеотидных замен по циклам") + theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_new_genome/mpc_density.png", width = 2250, height = 1500, units = "px") rm (df2) #Mismatches depending on GC-content mgc2<-cbind(tp2_genome[, c("Count_SNPs", "Sample")], "GC" = round(tp2_genome$GC)) ggplot(mgc2, aes(x=GC, y=Count_SNPs)) + stat_summary(fun = sum, geom="line", linewidth = 0.5) + facet_wrap(~Sample, nrow = 3) + xlab("ГЦ-состав, %")+ ylab("Количество однонуклеотидных замен")+ ggtitle("Количество однонуклеотидных замен в зависимости от ГЦ-состава прочтений") + geom_vline(xintercept = 50, linetype="dotted") + theme_bw()+ theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_new_genome/mgc.png", width = 2250, height = 1500, units = "px") ggplot(mgc2, aes(x=GC, y=Count_SNPs)) + stat_summary(fun = mean, geom="line", size = 0.5) + facet_wrap(~Sample, nrow = 3) + xlab("ГЦ-состав, %")+ ylab("Среднее число однонуклеотидных замен на одно прочтение")+ ggtitle("Количество однонуклеотидных замен в зависимости от ГЦ-состава прочтения") + geom_vline(xintercept = 50, linetype="dotted") + theme_bw()+ theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_new_genome/mgc_mean.png", width = 2250, height = 1500, units = "px") rm (mgc2) #Frequency of different types of mismatches mtype2 <- data.frame(matrix(ncol = 3, nrow = 0)) colnames(mtype2)<- c("Sum", "type", "Sample") for (i in samples_ecoli_chr) { temp <- apply(tp2_genome[, c(12:23)][tp2_genome$Sample==i,], 2, sum) temp <- data.frame("Sum" = temp) temp$type<-as.factor(colnames(tp2_genome[, c(12:23)])) temp<-cbind(temp, "Sample" = i) mtype2<-rbind(mtype2, temp) } ggplot(mtype2) + geom_line(aes(x = type, y = Sum, group = Sample)) + geom_point(aes(x = type, y = Sum)) + #facet_wrap(~Sample, nrow = 3, scales = "free_y") + facet_wrap(~Sample, nrow = 3, ) + xlab("Тип однонуклеотидной замены")+ ylab("Количество однонуклеотидных замен")+ ggtitle("Частота типов однонуклеотидных замен") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_new_genome/mt.png", width = 2250, height = 1500, units = "px") mtype2_2 <- data.frame(matrix(ncol = 3, nrow = 0)) colnames(mtype2_2)<- c("mean", "type", "Sample") for (i in samples_ecoli_chr) { temp <- apply(tp2_genome[tp2_genome$Sample==i, c(12:23)], 2, mean) temp <- data.frame("mean" = temp) temp$type<-as.factor(colnames(tp2_genome[, c(12:23)])) temp<-cbind(temp, "Sample" = i) mtype2_2<-rbind(mtype2_2, temp) } ggplot(mtype2_2) + geom_line(aes(x = type, y = mean, group = Sample)) + geom_point(aes(x = type, y = mean)) + #facet_wrap(~Sample, nrow = 3, scales = "free_y") + facet_wrap(~Sample, nrow = 3) + xlab("Тип однонуклеотидной замены")+ ylab("Среднее по прочтению")+ ggtitle("Частота типов однонуклеотидных замен") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_new_genome/mt_mean.png", width = 2250, height = 1500, units = "px") rm(mtype2, mtype2_2) #Number of deletions per sample delets2 <- data.frame(matrix(ncol = 2, nrow = 0)) colnames(delets2)<- c("Nucleotide", "Sample") for (i in samples_ecoli_chr) { temp <- tp2_genome[tp2_genome$Sample==i,][,10] temp <- unlist(strsplit(temp,"")) temp<- temp[temp %in% c(letters, LETTERS)] temp<-cbind("Nucleotide"=temp, "Sample" = i) temp<-data.frame(temp) temp$Nucleotide <- as.factor(temp$Nucleotide) delets2<-rbind(temp, delets2) } ggplot(delets2) + geom_bar(aes(x = Nucleotide), width = 0.4, fill = "blue", alpha = 0.5, col = "black") + facet_wrap(~Sample, nrow = 3) + #ylim(0, 1500) + xlab("Нуклеотид")+ ylab("Количество делеций")+ ggtitle("Состав делеций") + theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_new_genome/delets.png", width = 2250, height = 1500, units = "px") rm (delets2) #Number of insertions per sample inserts2 <- data.frame(matrix(ncol = 2, nrow = 0)) colnames(inserts2)<- c("Nucleotide", "Sample") for (i in samples_ecoli_chr) { temp <- tp2_genome[tp2_genome$Sample==i,][,11] temp <- unlist(strsplit(temp,"")) temp<- temp[temp %in% c(letters, LETTERS)] temp<-cbind("Nucleotide"=temp, "Sample" = i) temp<-data.frame(temp) temp$Nucleotide <- as.factor(temp$Nucleotide) inserts2<-rbind(temp, inserts2) } ggplot(inserts2) + xlab("Нуклеотид")+ ylab("Количество вставок")+ ggtitle("Состав вставок") + geom_bar(aes(x = Nucleotide), width = 0.4, fill = "blue", alpha = 0.5, col = "black") + facet_wrap(~Sample, nrow = 3) + #ylim(0, 2000) + theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_new_genome/inserts.png", width = 2250, height = 1500, units = "px") rm (inserts2) #Mean, min and max values of insertions per sample insert_mean2 <- data.frame(matrix(ncol = 7, nrow = 0)) colnames(insert_mean2)<- c("Sample", "Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.") for (i in samples_ecoli_chr) { temp<-subset(tp2_genome, Mean_length_del!=0 & Sample == i)[,7] temp<-cbind("Sample"=i, t(summary(temp))) insert_mean2<-rbind(insert_mean2, temp) } insert_mean2[, 2]<-as.numeric(insert_mean2[, 2]) insert_mean2[, 4]<-as.numeric(insert_mean2[, 4]) insert_mean2[, 5]<-as.numeric(insert_mean2[, 5]) insert_mean2[, 7]<-as.numeric(insert_mean2[, 7]) ggplot(data.frame(insert_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) + geom_pointrange(aes(ymin = 0, ymax = Max.)) + geom_point(aes(x = factor(Sample), y = 0), shape = 1) + geom_point(aes(x = factor(Sample), y = Max.), shape = 1) + xlab("Образец")+ ylab("Количество нуклеотидов")+ ggtitle("Статистика длины вставки") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_new_genome/iml.png", width = 2250, height = 1500, units = "px") rm (insert_mean2) #Mean, min and max values of deletions per sample delete_mean2 <- data.frame(matrix(ncol = 7, nrow = 0)) colnames(delete_mean2)<- c("Sample", "Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.") for (i in samples_ecoli_chr) { temp<-subset(tp2_genome, Mean_length_del!=0 & Sample == i)[,6] temp<-cbind("Sample"=i, t(summary(temp))) delete_mean2<-rbind(delete_mean2, temp) } delete_mean2[, 2]<-as.numeric(delete_mean2[, 2]) delete_mean2[, 4]<-as.numeric(delete_mean2[, 4]) delete_mean2[, 5]<-as.numeric(delete_mean2[, 5]) delete_mean2[, 7]<-as.numeric(delete_mean2[, 7]) ggplot(data.frame(delete_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) + geom_pointrange(aes(ymin = 0, ymax = Max.)) + geom_point(aes(x = factor(Sample), y = 0), shape = 1) + geom_point(aes(x = factor(Sample), y = Max.), shape = 1) + xlab("Образец")+ ylab("Количество нуклеотидов")+ ggtitle("Статистика длины делеции") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_new_genome/dml.png", width = 2250, height = 1500, units = "px") rm(delete_mean2) #Number of mismatches per sample ggplot(tp2_genome, aes(x=Sample, y=Count_SNPs, group = 1)) + stat_summary(fun = sum, geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + xlab("Образец")+ ylab("Количество замен")+ ggtitle("Количество однонуклеотидных замен") + #geom_vline(xintercept = 50, linetype="dotted") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_new_genome/mps_mean.png", width = 2250, height = 1500, units = "px") ggplot(tp2_genome, aes(x=Sample, y=Count_SNPs, group = 1)) + stat_summary(fun = mean, geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + xlab("Образец")+ ylab("Среднее число замен")+ ggtitle("Количество однонуклеотидных замен на одно прочтение") + #geom_vline(xintercept = 50, linetype="dotted") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_new_genome/mps.png", width = 2250, height = 1500, units = "px") #Number of deletions per cycle of sequence numbers_only <- function(x) !grepl("\\D", x) dels2 <- data.frame(matrix(nrow = 0, ncol = 2)) colnames(dels2)<-c("Num", "Sample") for (i in samples_ecoli_chr) { temp <- tp2_genome[tp2_genome$Sample==i,][,10] temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) temp <- unlist(strsplit(temp,",")) temp<- temp[numbers_only(temp)] temp<-cbind("Num"=temp, "Sample" = i) temp<-data.frame(temp) dels2<-rbind(temp, dels2) } dels2$Num<-as.numeric(dels2$Num) ggplot(dels2, aes(x = Num)) + #geom_density(stat = "count", alpha = 0.5) + stat_count(geom="line", position="identity") + stat_count(geom="point", position="identity", size = 1) + facet_wrap(~Sample, nrow = 3) + #ylim(0, 1500) + xlab("Цикл прочтения, №")+ ylab("Количество делеций")+ ggtitle("Число делеций на цикл") + theme_bw()+ theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_new_genome/dpc.png", width = 2250, height = 1500, units = "px") rm(dels2) #Number of insertions per cycle of sequence ins2 <- data.frame(matrix(nrow = 0, ncol = 2)) colnames(ins2)<-c("Num", "Sample") for (i in samples_ecoli_chr) { temp <- tp2_genome[tp2_genome$Sample==i,][,11] temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) temp <- unlist(strsplit(temp,",")) temp<- temp[numbers_only(temp)] temp<-cbind("Num"=temp, "Sample" = i) temp<-data.frame(temp) ins2<-rbind(temp, ins2) } ins2$Num<-as.numeric(ins2$Num) ggplot(ins2, aes(x = Num)) + stat_count(geom="line", position="identity") + stat_count(geom="point", position="identity", size = 1) + facet_wrap(~Sample, nrow = 3) + #ylim(0, 1500) + xlab("Цикл прочтения, №")+ ylab("Количество вставок")+ ggtitle("Число вставок на цикл") + scale_x_continuous(breaks=seq(0, 150, 25))+ theme_bw()+ theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_new_genome/ipc.png", width = 2250, height = 1500, units = "px") rm (ins2) #Number of deletions per sample parse_delets2 <- data.frame(matrix(nrow = 0, ncol = 2)) colnames(parse_delets2)<-c("Num", "Sample") for (i in samples_ecoli_chr) { temp <- tp2_genome[tp2_genome$Sample==i,][,10] temp <- unlist(strsplit(temp,split="|",fixed = TRUE)) temp<-cbind("Num"=temp, "Sample" = i) temp<-data.frame(temp) parse_delets2<-rbind(temp, parse_delets2) } ggplot(parse_delets2, aes(x=Sample, group = 1)) + stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + xlab("Образец")+ ylab("Количество делеций")+ ggtitle("Число делеций на образец") + #geom_vline(xintercept = 50, linetype="dotted") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_new_genome/dps.png", width = 2250, height = 1500, units = "px") rm (parse_delets2) #Number of insertions per sample parse_inserts2 <- data.frame(matrix(nrow = 0, ncol = 2)) colnames(parse_inserts2)<-c("Num", "Sample") for (i in samples_ecoli_chr) { temp <- tp2_genome[tp2_genome$Sample==i,][,11] temp <- unlist(strsplit(temp,split="|",fixed = TRUE)) temp<-cbind("Num"=temp, "Sample" = i) temp<-data.frame(temp) parse_inserts2<-rbind(temp, parse_inserts2) } ggplot(parse_inserts2, aes(x=Sample, group = 1)) + stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + xlab("Образец")+ ylab("Количество вставок")+ ggtitle("Число вставок на образец") + #geom_vline(xintercept = 50, linetype="dotted") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_new_genome/ips.png", width = 2250, height = 1500, units = "px") rm (parse_inserts2, temp) #Frequency of insertion motifs per sample insert_motifs2 <- data.frame(matrix(nrow = 0, ncol = 2)) colnames(insert_motifs2)<-c("Num", "Sample") for (i in samples_ecoli_chr) { temp <- tp2_genome[tp2_genome$Sample==i,][,11] temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) temp <- unlist(strsplit(temp,",")) temp<- temp[!numbers_only(temp)] temp<-cbind("Num"=temp, "Sample" = i) temp<-data.frame(temp) insert_motifs2<-rbind(temp, insert_motifs2) } ggplot(insert_motifs2, aes(x = Num)) + stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) + facet_wrap(~Sample, nrow=4, scales = "free_y") + xlab("Мотив")+ ylab("Количество")+ ggtitle("Представленность мотивов вставок в образцах") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_new_genome/imps.png", width = 10550, height = 2500, units = "px") rm (insert_motifs2, temp) #Frequency of deletion motifs per sample delete_motifs2 <- data.frame(matrix(nrow = 0, ncol = 2)) colnames(delete_motifs2)<-c("Num", "Sample") for (i in samples_ecoli_chr) { temp <- tp2_genome[tp2_genome$Sample==i,][,10] temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) temp <- unlist(strsplit(temp,",")) temp<- temp[!numbers_only(temp)] temp<-cbind("Num"=temp, "Sample" = i) temp<-data.frame(temp) delete_motifs2<-rbind(temp, delete_motifs2) } ggplot(delete_motifs2, aes(x = Num)) + stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) + facet_wrap(~Sample, nrow=4, scales = "free_y") + xlab("Мотив")+ ylab("Количество")+ ggtitle("Представленность мотивов делеций в образцах") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_new_genome/dmps.png", width = 10550, height = 2500, units = "px") rm (delete_motifs2, temp)