diff --git a/Supplementary/Sup_ECOLI_genome_main.pdf b/Supplementary/Sup_ECOLI_genome_main.pdf new file mode 100644 index 0000000..44b3ed8 Binary files /dev/null and b/Supplementary/Sup_ECOLI_genome_main.pdf differ diff --git a/Supplementary/Sup_ECOLI_plasmid_main.pdf b/Supplementary/Sup_ECOLI_plasmid_main.pdf new file mode 100644 index 0000000..aa7568d Binary files /dev/null and b/Supplementary/Sup_ECOLI_plasmid_main.pdf differ diff --git a/Supplementary/Sup_ZVL_main.pdf b/Supplementary/Sup_ZVL_main.pdf new file mode 100644 index 0000000..e7ba95b Binary files /dev/null and b/Supplementary/Sup_ZVL_main.pdf differ diff --git a/Supplementary/Sup_ZVL_pcr_free.pdf b/Supplementary/Sup_ZVL_pcr_free.pdf new file mode 100644 index 0000000..0a71031 Binary files /dev/null and b/Supplementary/Sup_ZVL_pcr_free.pdf differ diff --git a/chromosome.R b/chromosome.R new file mode 100644 index 0000000..346fde0 --- /dev/null +++ b/chromosome.R @@ -0,0 +1,401 @@ +library(ggplot2) +library(stringr) +##### +#This part of scripts produces the plots of alignment data. Only genome-aligned reverse reads were taken here. +#For every sample was created a set of reads of equal capacity (min between all sample reads). +##### + +#Uploading data in R + +temp_name<-read.table("names_of_custom_genome.txt") +samples_ecoli_chr <- c() +for (i in temp_name[,c("V1")]) { + samples_ecoli_chr<-c(samples_ecoli_chr, gsub("_genome_mapq20_table.txt", "", gsub("custom_", "", i))) +} +tp2_genome <- data.frame(matrix(ncol = 24, nrow = 0)) +colnames(tp2_genome) <- unlist(strsplit("ID;Read;Count_SNPs;Cycles;GC;Mean_length_del;Mean_length_insert;Read_length;Align_length;Dels;Inserts;A>C;A>G;A>T;C>A;C>G;C>T;G>A;G>C;G>T;T>A;T>C;T>G",";")) +for (i in temp_name[,c("V1")]) { + temp<-read.csv(i, sep=";", header=TRUE) + temp<-subset(temp, Read==2) + temp<-temp[sample(nrow(temp), 500000),] + temp<-cbind(temp, "Sample"=gsub("_genome_mapq20_table.txt", "", gsub("custom_", "", i))) + tp2_genome<-rbind(tp2_genome, temp) +} +rm(temp, temp_name) +tp2_genome$Sample<-as.factor(tp2_genome$Sample) +dir.create("../report/images_new_genome") + +#GC-content + +ggplot(data = tp2_genome, aes(x = GC)) + + geom_density(stat = "density", fill = "pink", alpha = 0.5) + + facet_wrap(~Sample, nrow = 3) + + geom_vline(xintercept = 50, linetype="dotted") + + xlab("ГЦ-состав, %") + + ylab("Доля прочтений") + + xlim(25, 75) + + ylim(0, 0.15) + + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_new_genome/gc.png", width = 2250, height = 1500, units = "px") + +#Mismatches per cycle of sequence + +df2<-data.frame(matrix(nrow = 0, ncol = 2)) +colnames(df2)<-c("Cycles", "Sample") +for (i in samples_ecoli_chr) { + cyc <-subset(tp2_genome, Sample == i)[,4] + cyc <- as.numeric(unlist(strsplit(cyc,","))) + df2<-rbind(df2, cbind("Cycles" = cyc, "Sample" =i)) +} +df2$Sample<-as.factor(df2$Sample) +df2$Cycles<-as.numeric(df2$Cycles) +rm (cyc, i) +ggplot(df2, aes(x = Cycles)) + + geom_density(stat = "count", fill = "blue", alpha = 0.5) + + #geom_line(stat = "count", fill = "blue", alpha = 0.5) + + facet_wrap(~Sample, nrow = 3) + + xlab("Цикл прочтения, №")+ + ylab("Количество замен")+ + ggtitle("Количество однонуклеотидных замен по циклам") + + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_new_genome/mpc.png", width = 2250, height = 1500, units = "px") +ggplot(df2, aes(x = Cycles)) + + geom_density(stat = "density", fill = "blue", alpha = 0.5) + + facet_wrap(~Sample, nrow = 3) + + xlab("Цикл прочтения, №")+ + ylab("Доля от общего количества")+ + ggtitle("Количество однонуклеотидных замен по циклам") + + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_new_genome/mpc_density.png", width = 2250, height = 1500, units = "px") +rm (df2) + +#Mismatches depending on GC-content + +mgc2<-cbind(tp2_genome[, c("Count_SNPs", "Sample")], "GC" = round(tp2_genome$GC)) +ggplot(mgc2, aes(x=GC, y=Count_SNPs)) + + stat_summary(fun = sum, geom="line", linewidth = 0.5) + + facet_wrap(~Sample, nrow = 3) + + xlab("ГЦ-состав, %")+ + ylab("Количество однонуклеотидных замен")+ + ggtitle("Количество однонуклеотидных замен в зависимости от ГЦ-состава прочтений") + + geom_vline(xintercept = 50, linetype="dotted") + + theme_bw()+ + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_new_genome/mgc.png", width = 2250, height = 1500, units = "px") +ggplot(mgc2, aes(x=GC, y=Count_SNPs)) + + stat_summary(fun = mean, geom="line", size = 0.5) + + facet_wrap(~Sample, nrow = 3) + + xlab("ГЦ-состав, %")+ + ylab("Среднее число однонуклеотидных замен на одно прочтение")+ + ggtitle("Количество однонуклеотидных замен в зависимости от ГЦ-состава прочтения") + + geom_vline(xintercept = 50, linetype="dotted") + + theme_bw()+ + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_new_genome/mgc_mean.png", width = 2250, height = 1500, units = "px") +rm (mgc2) + +#Frequency of different types of mismatches + +mtype2 <- data.frame(matrix(ncol = 3, nrow = 0)) +colnames(mtype2)<- c("Sum", "type", "Sample") +for (i in samples_ecoli_chr) { + temp <- apply(tp2_genome[, c(12:23)][tp2_genome$Sample==i,], 2, sum) + temp <- data.frame("Sum" = temp) + temp$type<-as.factor(colnames(tp2_genome[, c(12:23)])) + temp<-cbind(temp, "Sample" = i) + mtype2<-rbind(mtype2, temp) +} +ggplot(mtype2) + + geom_line(aes(x = type, y = Sum, group = Sample)) + + geom_point(aes(x = type, y = Sum)) + + #facet_wrap(~Sample, nrow = 3, scales = "free_y") + + facet_wrap(~Sample, nrow = 3, ) + + xlab("Тип однонуклеотидной замены")+ + ylab("Количество однонуклеотидных замен")+ + ggtitle("Частота типов однонуклеотидных замен") + + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) +ggsave("../report/images_new_genome/mt.png", width = 2250, height = 1500, units = "px") + +mtype2_2 <- data.frame(matrix(ncol = 3, nrow = 0)) +colnames(mtype2_2)<- c("mean", "type", "Sample") +for (i in samples_ecoli_chr) { + temp <- apply(tp2_genome[tp2_genome$Sample==i, c(12:23)], 2, mean) + temp <- data.frame("mean" = temp) + temp$type<-as.factor(colnames(tp2_genome[, c(12:23)])) + temp<-cbind(temp, "Sample" = i) + mtype2_2<-rbind(mtype2_2, temp) +} +ggplot(mtype2_2) + + geom_line(aes(x = type, y = mean, group = Sample)) + + geom_point(aes(x = type, y = mean)) + + #facet_wrap(~Sample, nrow = 3, scales = "free_y") + + facet_wrap(~Sample, nrow = 3) + + xlab("Тип однонуклеотидной замены")+ + ylab("Среднее по прочтению")+ + ggtitle("Частота типов однонуклеотидных замен") + + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) +ggsave("../report/images_new_genome/mt_mean.png", width = 2250, height = 1500, units = "px") +rm(mtype2, mtype2_2) + +#Number of deletions per sample + +delets2 <- data.frame(matrix(ncol = 2, nrow = 0)) +colnames(delets2)<- c("Nucleotide", "Sample") +for (i in samples_ecoli_chr) { + temp <- tp2_genome[tp2_genome$Sample==i,][,10] + temp <- unlist(strsplit(temp,"")) + temp<- temp[temp %in% c(letters, LETTERS)] + temp<-cbind("Nucleotide"=temp, "Sample" = i) + temp<-data.frame(temp) + temp$Nucleotide <- as.factor(temp$Nucleotide) + delets2<-rbind(temp, delets2) +} +ggplot(delets2) + + geom_bar(aes(x = Nucleotide), width = 0.4, fill = "blue", alpha = 0.5, col = "black") + + facet_wrap(~Sample, nrow = 3) + + #ylim(0, 1500) + + xlab("Нуклеотид")+ + ylab("Количество делеций")+ + ggtitle("Состав делеций") + + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_new_genome/delets.png", width = 2250, height = 1500, units = "px") +rm (delets2) + +#Number of insertions per sample + +inserts2 <- data.frame(matrix(ncol = 2, nrow = 0)) +colnames(inserts2)<- c("Nucleotide", "Sample") +for (i in samples_ecoli_chr) { + temp <- tp2_genome[tp2_genome$Sample==i,][,11] + temp <- unlist(strsplit(temp,"")) + temp<- temp[temp %in% c(letters, LETTERS)] + temp<-cbind("Nucleotide"=temp, "Sample" = i) + temp<-data.frame(temp) + temp$Nucleotide <- as.factor(temp$Nucleotide) + inserts2<-rbind(temp, inserts2) +} +ggplot(inserts2) + + xlab("Нуклеотид")+ + ylab("Количество вставок")+ + ggtitle("Состав вставок") + + geom_bar(aes(x = Nucleotide), width = 0.4, fill = "blue", alpha = 0.5, col = "black") + + facet_wrap(~Sample, nrow = 3) + + #ylim(0, 2000) + + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_new_genome/inserts.png", width = 2250, height = 1500, units = "px") +rm (inserts2) + +#Mean, min and max values of insertions per sample + +insert_mean2 <- data.frame(matrix(ncol = 7, nrow = 0)) +colnames(insert_mean2)<- c("Sample", "Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.") +for (i in samples_ecoli_chr) { + temp<-subset(tp2_genome, Mean_length_del!=0 & Sample == i)[,7] + temp<-cbind("Sample"=i, t(summary(temp))) + insert_mean2<-rbind(insert_mean2, temp) +} +insert_mean2[, 2]<-as.numeric(insert_mean2[, 2]) +insert_mean2[, 4]<-as.numeric(insert_mean2[, 4]) +insert_mean2[, 5]<-as.numeric(insert_mean2[, 5]) +insert_mean2[, 7]<-as.numeric(insert_mean2[, 7]) +ggplot(data.frame(insert_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) + + geom_pointrange(aes(ymin = 0, ymax = Max.)) + + geom_point(aes(x = factor(Sample), y = 0), shape = 1) + + geom_point(aes(x = factor(Sample), y = Max.), shape = 1) + + xlab("Образец")+ + ylab("Количество нуклеотидов")+ + ggtitle("Статистика длины вставки") + + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) +ggsave("../report/images_new_genome/iml.png", width = 2250, height = 1500, units = "px") +rm (insert_mean2) + +#Mean, min and max values of deletions per sample + +delete_mean2 <- data.frame(matrix(ncol = 7, nrow = 0)) +colnames(delete_mean2)<- c("Sample", "Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.") +for (i in samples_ecoli_chr) { + temp<-subset(tp2_genome, Mean_length_del!=0 & Sample == i)[,6] + temp<-cbind("Sample"=i, t(summary(temp))) + delete_mean2<-rbind(delete_mean2, temp) +} +delete_mean2[, 2]<-as.numeric(delete_mean2[, 2]) +delete_mean2[, 4]<-as.numeric(delete_mean2[, 4]) +delete_mean2[, 5]<-as.numeric(delete_mean2[, 5]) +delete_mean2[, 7]<-as.numeric(delete_mean2[, 7]) +ggplot(data.frame(delete_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) + + geom_pointrange(aes(ymin = 0, ymax = Max.)) + + geom_point(aes(x = factor(Sample), y = 0), shape = 1) + + geom_point(aes(x = factor(Sample), y = Max.), shape = 1) + + xlab("Образец")+ + ylab("Количество нуклеотидов")+ + ggtitle("Статистика длины делеции") + + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) +ggsave("../report/images_new_genome/dml.png", width = 2250, height = 1500, units = "px") +rm(delete_mean2) + +#Number of mismatches per sample + +ggplot(tp2_genome, aes(x=Sample, y=Count_SNPs, group = 1)) + + stat_summary(fun = sum, geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + + xlab("Образец")+ + ylab("Количество замен")+ + ggtitle("Количество однонуклеотидных замен") + + #geom_vline(xintercept = 50, linetype="dotted") + + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) +ggsave("../report/images_new_genome/mps_mean.png", width = 2250, height = 1500, units = "px") + + +ggplot(tp2_genome, aes(x=Sample, y=Count_SNPs, group = 1)) + + stat_summary(fun = mean, geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + + xlab("Образец")+ + ylab("Среднее число замен")+ + ggtitle("Количество однонуклеотидных замен на одно прочтение") + + #geom_vline(xintercept = 50, linetype="dotted") + + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) +ggsave("../report/images_new_genome/mps.png", width = 2250, height = 1500, units = "px") + +#Number of deletions per cycle of sequence + +numbers_only <- function(x) !grepl("\\D", x) + +dels2 <- data.frame(matrix(nrow = 0, ncol = 2)) +colnames(dels2)<-c("Num", "Sample") +for (i in samples_ecoli_chr) { + temp <- tp2_genome[tp2_genome$Sample==i,][,10] + temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) + temp <- unlist(strsplit(temp,",")) + temp<- temp[numbers_only(temp)] + temp<-cbind("Num"=temp, "Sample" = i) + temp<-data.frame(temp) + dels2<-rbind(temp, dels2) +} +dels2$Num<-as.numeric(dels2$Num) +ggplot(dels2, aes(x = Num)) + + #geom_density(stat = "count", alpha = 0.5) + + stat_count(geom="line", position="identity") + + stat_count(geom="point", position="identity", size = 1) + + facet_wrap(~Sample, nrow = 3) + + #ylim(0, 1500) + + xlab("Цикл прочтения, №")+ + ylab("Количество делеций")+ + ggtitle("Число делеций на цикл") + + theme_bw()+ + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_new_genome/dpc.png", width = 2250, height = 1500, units = "px") +rm(dels2) + +#Number of insertions per cycle of sequence + +ins2 <- data.frame(matrix(nrow = 0, ncol = 2)) +colnames(ins2)<-c("Num", "Sample") +for (i in samples_ecoli_chr) { + temp <- tp2_genome[tp2_genome$Sample==i,][,11] + temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) + temp <- unlist(strsplit(temp,",")) + temp<- temp[numbers_only(temp)] + temp<-cbind("Num"=temp, "Sample" = i) + temp<-data.frame(temp) + ins2<-rbind(temp, ins2) +} +ins2$Num<-as.numeric(ins2$Num) +ggplot(ins2, aes(x = Num)) + + stat_count(geom="line", position="identity") + + stat_count(geom="point", position="identity", size = 1) + + facet_wrap(~Sample, nrow = 3) + + #ylim(0, 1500) + + xlab("Цикл прочтения, №")+ + ylab("Количество вставок")+ + ggtitle("Число вставок на цикл") + + scale_x_continuous(breaks=seq(0, 150, 25))+ + theme_bw()+ + theme(plot.title = element_text(hjust = 0.5)) +ggsave("../report/images_new_genome/ipc.png", width = 2250, height = 1500, units = "px") +rm (ins2) + +#Number of deletions per sample + +parse_delets2 <- data.frame(matrix(nrow = 0, ncol = 2)) +colnames(parse_delets2)<-c("Num", "Sample") +for (i in samples_ecoli_chr) { + temp <- tp2_genome[tp2_genome$Sample==i,][,10] + temp <- unlist(strsplit(temp,split="|",fixed = TRUE)) + temp<-cbind("Num"=temp, "Sample" = i) + temp<-data.frame(temp) + parse_delets2<-rbind(temp, parse_delets2) +} +ggplot(parse_delets2, aes(x=Sample, group = 1)) + + stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + + xlab("Образец")+ + ylab("Количество делеций")+ + ggtitle("Число делеций на образец") + + #geom_vline(xintercept = 50, linetype="dotted") + + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) +ggsave("../report/images_new_genome/dps.png", width = 2250, height = 1500, units = "px") +rm (parse_delets2) + +#Number of insertions per sample + +parse_inserts2 <- data.frame(matrix(nrow = 0, ncol = 2)) +colnames(parse_inserts2)<-c("Num", "Sample") +for (i in samples_ecoli_chr) { + temp <- tp2_genome[tp2_genome$Sample==i,][,11] + temp <- unlist(strsplit(temp,split="|",fixed = TRUE)) + temp<-cbind("Num"=temp, "Sample" = i) + temp<-data.frame(temp) + parse_inserts2<-rbind(temp, parse_inserts2) +} +ggplot(parse_inserts2, aes(x=Sample, group = 1)) + + stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + + xlab("Образец")+ + ylab("Количество вставок")+ + ggtitle("Число вставок на образец") + + #geom_vline(xintercept = 50, linetype="dotted") + + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) +ggsave("../report/images_new_genome/ips.png", width = 2250, height = 1500, units = "px") +rm (parse_inserts2, temp) + +#Frequency of insertion motifs per sample + +insert_motifs2 <- data.frame(matrix(nrow = 0, ncol = 2)) +colnames(insert_motifs2)<-c("Num", "Sample") +for (i in samples_ecoli_chr) { + temp <- tp2_genome[tp2_genome$Sample==i,][,11] + temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) + temp <- unlist(strsplit(temp,",")) + temp<- temp[!numbers_only(temp)] + temp<-cbind("Num"=temp, "Sample" = i) + temp<-data.frame(temp) + insert_motifs2<-rbind(temp, insert_motifs2) +} +ggplot(insert_motifs2, aes(x = Num)) + + stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) + + facet_wrap(~Sample, nrow=4, scales = "free_y") + + xlab("Мотив")+ + ylab("Количество")+ + ggtitle("Представленность мотивов вставок в образцах") + + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) +ggsave("../report/images_new_genome/imps.png", width = 10550, height = 2500, units = "px") +rm (insert_motifs2, temp) + +#Frequency of deletion motifs per sample + +delete_motifs2 <- data.frame(matrix(nrow = 0, ncol = 2)) +colnames(delete_motifs2)<-c("Num", "Sample") +for (i in samples_ecoli_chr) { + temp <- tp2_genome[tp2_genome$Sample==i,][,10] + temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) + temp <- unlist(strsplit(temp,",")) + temp<- temp[!numbers_only(temp)] + temp<-cbind("Num"=temp, "Sample" = i) + temp<-data.frame(temp) + delete_motifs2<-rbind(temp, delete_motifs2) +} +ggplot(delete_motifs2, aes(x = Num)) + + stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) + + facet_wrap(~Sample, nrow=4, scales = "free_y") + + xlab("Мотив")+ + ylab("Количество")+ + ggtitle("Представленность мотивов делеций в образцах") + + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) +ggsave("../report/images_new_genome/dmps.png", width = 10550, height = 2500, units = "px") +rm (delete_motifs2, temp) \ No newline at end of file diff --git a/custom_lib_fastp_zvl_without_assembly.sh b/custom_lib_fastp_zvl_without_assembly.sh new file mode 100644 index 0000000..b327215 --- /dev/null +++ b/custom_lib_fastp_zvl_without_assembly.sh @@ -0,0 +1,46 @@ +#!/bin/bash +source /storage/data1/marmi/miniconda/installation/etc/profile.d/conda.sh +conda activate marmi_methylome + +#перед каждым запуском заполнять эти пять переменных и path_to_sample +input_file="libs_11_12_23.csv" +cell="V350165701" +path_to_run="/storage/data1/aug/V350165701_repair" +goal_dir="custom_libs_E_Coli_Zvl" +path_to_assembly="/storage/data1/marmi/dec_custom_libs" + +path_to_python="/storage/data1/marmi/trial_scripts/split_assembly.py" +path_to_python_chr="/storage/data1/marmi/trial_scripts/chromosome.py" +#выкидываем заголовок и заменяем пробелы на нижние подчеркивания +cp $input_file source_csv.csv +sed -i 's/ /_/g' source_csv.csv + +reads_fastp.txt +rm source_csv.csv + +cat reads_fastp.txt | while read i || [[ -n $i ]]; +do +dir=`echo $i| awk '{split($0, array); print array[1]}'` +if [ "$dir" == "$goal_dir" ]; then + fn1=`echo $i| awk '{split($0, array); print array[2]}'` + fn2=`echo $i| awk '{split($0, array); print array[3]}'` + sample=`echo $i| awk '{split($0, array); print array[4]}'` + lane=`echo $i| awk '{split($0, array); print array[5]}'` + path_to_sample="/storage/data1/marmi/dec_custom_libs_repaired_${sample}" + mkdir ${path_to_sample} + mkdir ${path_to_sample}/qc_source_files + cd ${path_to_sample}/qc_source_files + read1="${path_to_run}/L0${lane}/${fn1}" + read2="${path_to_run}/L0${lane}/${fn2}" + fastp -e 30 -w 7 --in1 ${read1} --in2 ${read2} \ + --out1 custom_${sample}_1.fastq.gz --out2 custom_${sample}_2.fastq.gz \ + --unpaired1 custom_${sample}_u.fastq.gz --unpaired2 custom_${sample}_u.fastq.gz \ + 1>fastp_custom_${sample}_output.txt 2>fastp_custom_${sample}_err.txt +fi +done +#НЕ УДАЛЯТЬ READS_FASTP.TXT + +conda deactivate +conda activate marmi_telegram +telegram-send "Work_custom_union_assembly first step ends at `date`." +conda deactivate \ No newline at end of file diff --git a/custom_lib_union_align_zvl.sh b/custom_lib_union_align_zvl.sh new file mode 100644 index 0000000..e4e834e --- /dev/null +++ b/custom_lib_union_align_zvl.sh @@ -0,0 +1,61 @@ +#!/bin/bash +source /storage/data1/marmi/miniconda/installation/etc/profile.d/conda.sh +conda activate marmi_methylome +#перед каждым запуском заполнять эти три переменных и path_to_sample +path_to_assembly="/storage/data1/marmi/dec_custom_libs_zvl/unicycler_assembly_custom" +goal_dir="custom_libs_E_Coli_Zvl" +path_to_tables="/storage/data1/marmi/dec_custom_libs_zvl/info_tables_repaired" +mkdir /storage/data1/marmi/dec_custom_libs_zvl +path_to_python="/storage/data1/marmi/trial_scripts/info_pol_mapq_union.py" +mkdir ${path_to_tables} +touch ${path_to_tables}/names_of_custom_chromosome.txt +touch ${path_to_tables}/names_of_custom_plasmid.txt +cat reads_fastp.txt | while read i || [[ -n $i ]]; +do +dir=`echo $i| awk '{split($0, array); print array[1]}'` +if [ "$dir" == "$goal_dir" ]; then + sample=`echo $i| awk '{split($0, array); print array[4]}'` + path_to_sample="/storage/data1/marmi/dec_custom_libs_repaired_${sample}" + name_of_dir="union_${sample}" + mkdir ${path_to_sample}/${name_of_dir} + cd ${path_to_sample}/${name_of_dir} + bwa mem -t 7 ${path_to_assembly}/assembly.fasta ../qc_source_files/custom_${sample}_1.fastq.gz \ + ../qc_source_files/custom_${sample}_2.fastq.gz >custom_${sample}_paired.sam \ + 2>bwa_mem_custom_${sample}_paired.log + bwa mem -t 7 ${path_to_assembly}/assembly.fasta ../qc_source_files/custom_${sample}_u.fastq.gz \ + >custom_${sample}_unpaired.sam \ + 2>bwa_mem_custom_${sample}_unpaired.log + samtools view -Sb -o custom_${sample}_paired.bam custom_${sample}_paired.sam + samtools view -Sb -o custom_${sample}_unpaired.bam custom_${sample}_unpaired.sam + rm custom_${sample}_paired.sam + rm custom_${sample}_unpaired.sam + samtools sort -o custom_${sample}_paired_sorted.bam custom_${sample}_paired.bam + samtools sort -o custom_${sample}_unpaired_sorted.bam custom_${sample}_unpaired.bam + samtools merge -o custom_${sample}.bam \ + custom_${sample}_paired_sorted.bam custom_${sample}_unpaired_sorted.bam + samtools index custom_${sample}.bam + rm custom_${sample}_paired_sorted.bam + rm custom_${sample}_unpaired_sorted.bam + rm custom_${sample}_paired.bam + rm custom_${sample}_unpaired.bam + touch custom_${sample}_genome.txt + touch custom_${sample}_plasmid.txt + cat ${path_to_assembly}/chromosome_genome.txt | while read j || [[ -n $j ]]; + do + samtools view -F 4 custom_${sample}.bam $j | grep 'MD:Z' >> custom_${sample}_genome.txt + done + cat ${path_to_assembly}/chromosome_plasmid.txt | while read j || [[ -n $j ]]; + do + samtools view -F 4 custom_${sample}.bam $j | grep 'MD:Z' >> custom_${sample}_plasmid.txt + done + python3 ${path_to_python} ${sample}_genome ${path_to_sample}/${name_of_dir}/ ${path_to_tables}/ + python3 ${path_to_python} ${sample}_plasmid ${path_to_sample}/${name_of_dir}/ ${path_to_tables}/ + echo "custom_${sample}_genome_mapq20_table.txt" >>${path_to_tables}/names_of_custom_chromosome.txt + echo "custom_${sample}_plasmid_mapq20_table.txt" >>${path_to_tables}/names_of_custom_plasmid.txt + +fi +done +conda deactivate +conda activate marmi_telegram +telegram-send "Work_custom_union_align second step ends at `date`." +conda deactivate diff --git a/info_pol_mapq_union.py b/info_pol_mapq_union.py new file mode 100644 index 0000000..ed6a7a9 --- /dev/null +++ b/info_pol_mapq_union.py @@ -0,0 +1,170 @@ +# -*- coding: utf-8 -*- + +import sys +import re + +def GC(buf): + gc = buf.upper().count('G') + buf.upper().count('C') + all = buf.upper().count('A') + buf.upper().count('T') + gc + return (gc / all * 100) + +def len_align(buf): + """ + нужно передавать CIGAR + """ + let = re.split('[0-9]+', buf) + let.remove("") + num = re.split('[MIDSH]{1}', buf) + num.remove("") + num = list(map(int, num)) + length = 0 + length_read = 0 + for i in range(len(let)): + if (let[i]=="M") or (let[i]=="I"): + length+=num[i] + length_read+=num[i] + elif ((let[i]=="S") or (let[i]=="H")): + length_read+=num[i] + return (length, length_read) + +def info (buf, arg = "I"): + """ + Передавать CIGAR + """ + let = re.split('[0-9]+', buf) + let.remove("") + num = re.split('[MIDSH]{1}', buf) + num.remove("") + num = list(map(int, num)) + count = 0 + for i in range(len(let)): + if (let[i]==arg): + count+=num[i] + return count + +def info_insert (buf): + """ + Передавать CIGAR, возращает позиции инсерции, включая левый символ и не включая правый символ + """ + let = re.split('[0-9]+', buf) + let.remove("") + num = re.split('[MIDSH]{1}', buf) + num.remove("") + num = list(map(int, num)) + start = 0 + intervals = [] + for i in range(len(let)): + if (let[i]=="I"): + intervals.append([start, start+num[i]]) + if (let[i]=="D") or (let[i]=="H"): + continue + start+=num[i] + return tuple(intervals) + +def removal (buf): + """ + Передавать CIGAR, возращает позиции удаления, включая левый символ и не включая правый символ + """ + let = re.split('[0-9]+', buf) + let.remove("") + num = re.split('[MIDSH]{1}', buf) + num.remove("") + num = list(map(int, num)) + start = 0 + intervals = [] + for i in range(len(let)): + if (let[i]=="I") or (let[i]=="S"): + intervals.append([start, start+num[i]]) + if (let[i]=="D") or (let[i]=="H"): + continue + start+=num[i] + return tuple(intervals) + +def type_snp_text (seq, buf, cigar): + positions_ins = info_insert(cigar) + inserts_text = '' + inserts = [] + for i in positions_ins: + inserts.append((seq[i[0]:i[1]], i[0])) + inserts_text = inserts_text + str(seq[i[0]:i[1]]) + "," + str(i[0]) + '|' + inserts_text = inserts_text[:-1] + gc_per = GC(seq) + positions = removal(cigar) + for i in positions: + if (i[1]<(len(seq))): + seq = seq[:i[0]] + "_"*(i[1]-i[0]) + seq[i[1]:] + else: + seq = seq[:i[0]] + "_"*(i[1]-i[0]) + seq = seq.replace("_", "") + buf = buf.replace("MD:Z:", "") + types = {"A>C":0, "A>G":0, "A>T":0, "C>A":0, "C>G":0, "C>T":0, "G>A":0, "G>C":0, "G>T":0, "T>A":0, "T>C":0, "T>G":0} + cycles_text = '' + dels_text = '' + cycles = [] + dels = [] + let = re.split('[0-9]+', buf) + let.pop(0) + num = re.split('[ATGC^]+', buf) + num = list(map(int, num)) + iter = 0 + for i in range(len(num)): + iter+=num[i] + if (let[i] == '') or (let[i] == '\n'): + continue + if (let[i][0] == '^'): + dels_text = dels_text + str(let[i][1:]) + "," + str(iter) + "|" + dels.append((let[i][1:], iter)) + continue + if (seq[iter]=="A") or (seq[iter]=="C") or (seq[iter]=="G") or (seq[iter]=="T"): + key = let[i] + ">" + seq[iter] + types[key]+=1 + cycles.append(iter) + cycles_text = cycles_text + str(iter) + "," + iter+=1 + else: + iter+=1 + l = len_align(cigar) + dels_text = dels_text[:-1] + cycles_text = cycles_text[:-1] + if len(dels)!=0: + len_del = sum([len(dels[i][0]) for i in range(len(dels))])/len(dels) + else: + len_del = 0 + if len(inserts)!=0: + len_ins = sum([len(inserts[i][0]) for i in range(len(inserts))])/len(inserts) + else: + len_ins = 0 + result = {"SNP": types, "Cycles": cycles_text, "Dels": dels_text, "Inserts" : inserts_text, "GC": gc_per, "Count_SNPs": sum(types.values()), "Mean_length_del": len_del, \ + "Mean_length_insert": len_ins, "Read_length": l[1], "Align_length": l[0]} + return result + + + +with open(sys.argv[3]+"custom_"+sys.argv[1]+"_mapq20_table.txt", 'w') as table: + header = "ID;Read;Count_SNPs;Cycles;GC;Mean_length_del;Mean_length_insert;Read_length;Align_length;Dels;Inserts;A>C;A>G;A>T;C>A;C>G;C>T;G>A;G>C;G>T;T>A;T>C;T>G\n" + table.write(header) + with open(sys.argv[2]+"custom_"+sys.argv[1]+".txt", 'r') as inp: + for line in inp: + columns = line.split("\t") + if int(columns[4])>20: + id = columns[0] + flag = columns[1] + binary = "{0:b}".format(int(flag)) + if (binary[-1]=='0'): + typ = 0 + elif len(binary)==7: + typ = 1 + elif len(binary)>=8: + if (binary[-7]=='0') and (binary[-8]=='1'): + typ = 2 + else: + typ = -1 + else: + typ = -1 + cigar = columns[5] + md = columns[12] + seq = columns[9] + inf = type_snp_text(seq, md, cigar) + stroke = id + ";" + str(typ) + ";" + str(inf['Count_SNPs']) + ";" + inf['Cycles'] + ";" + str(inf['GC']) + ";" + str(inf['Mean_length_del']) + ";" + str(inf['Mean_length_insert']) + ";" + str(inf['Read_length']) + ";" + str(inf['Align_length']) + ";" + inf['Dels'] + ";" + inf['Inserts'] + ";" + \ + str(inf['SNP']['A>C']) + ";" + str(inf['SNP']['A>G']) + ";" + str(inf['SNP']['A>T']) + ";" + str(inf['SNP']['C>A']) + ";" + str(inf['SNP']['C>G']) + ";" + str(inf['SNP']['C>T']) + ";" + str(inf['SNP']['G>A']) + ";" + str(inf['SNP']['G>C']) + ";" + str(inf['SNP']['G>T']) + ";" + str(inf['SNP']['T>A']) + ";" + str(inf['SNP']['T>C']) + ";" + str(inf['SNP']['T>G']) + '\n' + table.write(stroke) diff --git a/plasmid.R b/plasmid.R index b2fa63a..a57a6cf 100644 --- a/plasmid.R +++ b/plasmid.R @@ -8,29 +8,31 @@ library(stringr) #Uploading data in R temp_name<-read.table("names_of_custom_plasmid.txt") -samples <- c() +samples_ecoli <- c() for (i in temp_name[,c("V1")]) { - samples<-c(samples, gsub("_plasmid_mapq20_table.txt", "", gsub("custom_", "", i))) + samples_ecoli<-c(samples_ecoli, gsub("_plasmid_mapq20_table.txt", "", gsub("custom_", "", i))) } -tp2 <- data.frame(matrix(ncol = 24, nrow = 0)) -colnames(tp2) <- unlist(strsplit("ID;Read;Count_SNPs;Cycles;GC;Mean_length_del;Mean_length_insert;Read_length;Align_length;Dels;Inserts;A>C;A>G;A>T;C>A;C>G;C>T;G>A;G>C;G>T;T>A;T>C;T>G",";")) +tp2_plasmid <- data.frame(matrix(ncol = 24, nrow = 0)) +colnames(tp2_plasmid) <- unlist(strsplit("ID;Read;Count_SNPs;Cycles;GC;Mean_length_del;Mean_length_insert;Read_length;Align_length;Dels;Inserts;A>C;A>G;A>T;C>A;C>G;C>T;G>A;G>C;G>T;T>A;T>C;T>G",";")) for (i in temp_name[,c("V1")]) { temp<-read.csv(i, sep=";", header=TRUE) temp<-subset(temp, Read==2) - temp<-temp[sample(nrow(temp), 200000),] + temp<-temp[sample(nrow(temp), 100000),] temp<-cbind(temp, "Sample"=gsub("_plasmid_mapq20_table.txt", "", gsub("custom_", "", i))) - tp2<-rbind(tp2, temp) + tp2_plasmid<-rbind(tp2_plasmid, temp) } rm(temp, temp_name) -tp2$Sample<-as.factor(tp2$Sample) +tp2_plasmid$Sample<-as.factor(tp2_plasmid$Sample) dir.create("../report/images_new_plasmid") #GC-content -ggplot(data = tp2, aes(x = GC)) + +ggplot(data = tp2_plasmid, aes(x = GC)) + geom_density(stat = "density", fill = "pink", alpha = 0.5) + - facet_wrap(~Sample, nrow = 2) + + facet_wrap(~Sample, nrow = 3) + geom_vline(xintercept = 50, linetype="dotted") + + xlab("ГЦ-состав, %") + + ylab("Доля прочтений") + xlim(25, 75) + ylim(0, 0.15) + theme(plot.title = element_text(hjust = 0.5)) @@ -40,8 +42,8 @@ ggsave("../report/images_new_plasmid/gc.png", width = 2250, height = 1500, units df2<-data.frame(matrix(nrow = 0, ncol = 2)) colnames(df2)<-c("Cycles", "Sample") -for (i in samples) { - cyc <-subset(tp2, Sample == i)[,4] +for (i in samples_ecoli) { + cyc <-subset(tp2_plasmid, Sample == i)[,4] cyc <- as.numeric(unlist(strsplit(cyc,","))) df2<-rbind(df2, cbind("Cycles" = cyc, "Sample" =i)) } @@ -51,33 +53,41 @@ rm (cyc, i) ggplot(df2, aes(x = Cycles)) + geom_density(stat = "count", fill = "blue", alpha = 0.5) + #geom_line(stat = "count", fill = "blue", alpha = 0.5) + - facet_wrap(~Sample, nrow = 2) + - ggtitle("Mismatches per cycle") + + facet_wrap(~Sample, nrow = 3) + + xlab("Цикл прочтения, №")+ + ylab("Количество замен")+ + ggtitle("Количество однонуклеотидных замен по циклам") + theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_new_plasmid/mpc.png", width = 2250, height = 1500, units = "px") ggplot(df2, aes(x = Cycles)) + geom_density(stat = "density", fill = "blue", alpha = 0.5) + - facet_wrap(~Sample, nrow = 2) + - ggtitle("Mismatches per cycle") + + facet_wrap(~Sample, nrow = 3) + + xlab("Цикл прочтения, №")+ + ylab("Доля от общего количества")+ + ggtitle("Количество однонуклеотидных замен по циклам") + theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_new_plasmid/mpc_density.png", width = 2250, height = 1500, units = "px") rm (df2) #Mismatches depending on GC-content -mgc2<-cbind(tp2[, c("Count_SNPs", "Sample")], "GC" = round(tp2$GC)) +mgc2<-cbind(tp2_plasmid[, c("Count_SNPs", "Sample")], "GC" = round(tp2_plasmid$GC)) ggplot(mgc2, aes(x=GC, y=Count_SNPs)) + stat_summary(fun = sum, geom="line", linewidth = 0.5) + - facet_wrap(~Sample, nrow = 2) + - ggtitle("Mismatches/GC") + + facet_wrap(~Sample, nrow = 3) + + xlab("ГЦ-состав, %")+ + ylab("Количество однонуклеотидных замен")+ + ggtitle("Количество однонуклеотидных замен в зависимости от ГЦ-состава прочтений") + geom_vline(xintercept = 50, linetype="dotted") + theme_bw()+ theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_new_plasmid/mgc.png", width = 2250, height = 1500, units = "px") ggplot(mgc2, aes(x=GC, y=Count_SNPs)) + stat_summary(fun = mean, geom="line", size = 0.5) + - facet_wrap(~Sample, nrow = 2) + - ggtitle("Mismatches/GC") + + facet_wrap(~Sample, nrow = 3) + + xlab("ГЦ-состав, %")+ + ylab("Среднее число однонуклеотидных замен на одно прочтение")+ + ggtitle("Количество однонуклеотидных замен в зависимости от ГЦ-состава прочтения") + geom_vline(xintercept = 50, linetype="dotted") + theme_bw()+ theme(plot.title = element_text(hjust = 0.5)) @@ -88,37 +98,41 @@ rm (mgc2) mtype2 <- data.frame(matrix(ncol = 3, nrow = 0)) colnames(mtype2)<- c("Sum", "type", "Sample") -for (i in samples) { - temp <- apply(tp2[, c(12:23)][tp2$Sample==i,], 2, sum) +for (i in samples_ecoli) { + temp <- apply(tp2_plasmid[, c(12:23)][tp2_plasmid$Sample==i,], 2, sum) temp <- data.frame("Sum" = temp) - temp$type<-as.factor(colnames(tp2[, c(12:23)])) + temp$type<-as.factor(colnames(tp2_plasmid[, c(12:23)])) temp<-cbind(temp, "Sample" = i) mtype2<-rbind(mtype2, temp) } ggplot(mtype2) + geom_line(aes(x = type, y = Sum, group = Sample)) + geom_point(aes(x = type, y = Sum)) + - #facet_wrap(~Sample, nrow = 2, scales = "free_y") + - facet_wrap(~Sample, nrow = 2, ) + - ggtitle("Mismatch type") + + #facet_wrap(~Sample, nrow = 3, scales = "free_y") + + facet_wrap(~Sample, nrow = 3, ) + + xlab("Тип однонуклеотидной замены")+ + ylab("Количество однонуклеотидных замен")+ + ggtitle("Частота типов однонуклеотидных замен") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_new_plasmid/mt.png", width = 2250, height = 1500, units = "px") mtype2_2 <- data.frame(matrix(ncol = 3, nrow = 0)) colnames(mtype2_2)<- c("mean", "type", "Sample") -for (i in samples) { - temp <- apply(tp2[tp2$Sample==i, c(12:23)], 2, mean) +for (i in samples_ecoli) { + temp <- apply(tp2_plasmid[tp2_plasmid$Sample==i, c(12:23)], 2, mean) temp <- data.frame("mean" = temp) - temp$type<-as.factor(colnames(tp2[, c(12:23)])) + temp$type<-as.factor(colnames(tp2_plasmid[, c(12:23)])) temp<-cbind(temp, "Sample" = i) mtype2_2<-rbind(mtype2_2, temp) } ggplot(mtype2_2) + geom_line(aes(x = type, y = mean, group = Sample)) + geom_point(aes(x = type, y = mean)) + - #facet_wrap(~Sample, nrow = 2, scales = "free_y") + - facet_wrap(~Sample, nrow = 2) + - ggtitle("Mismatch type") + + #facet_wrap(~Sample, nrow = 3, scales = "free_y") + + facet_wrap(~Sample, nrow = 3) + + xlab("Тип однонуклеотидной замены")+ + ylab("Среднее по прочтению")+ + ggtitle("Частота типов однонуклеотидных замен") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_new_plasmid/mt_mean.png", width = 2250, height = 1500, units = "px") rm(mtype2, mtype2_2) @@ -127,8 +141,8 @@ rm(mtype2, mtype2_2) delets2 <- data.frame(matrix(ncol = 2, nrow = 0)) colnames(delets2)<- c("Nucleotide", "Sample") -for (i in samples) { - temp <- tp2[tp2$Sample==i,][,10] +for (i in samples_ecoli) { + temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,10] temp <- unlist(strsplit(temp,"")) temp<- temp[temp %in% c(letters, LETTERS)] temp<-cbind("Nucleotide"=temp, "Sample" = i) @@ -138,9 +152,11 @@ for (i in samples) { } ggplot(delets2) + geom_bar(aes(x = Nucleotide), width = 0.4, fill = "blue", alpha = 0.5, col = "black") + - facet_wrap(~Sample, nrow = 2) + + facet_wrap(~Sample, nrow = 3) + #ylim(0, 1500) + - ggtitle ("Deletions") + + xlab("Нуклеотид")+ + ylab("Количество делеций")+ + ggtitle("Состав делеций") + theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_new_plasmid/delets.png", width = 2250, height = 1500, units = "px") rm (delets2) @@ -149,8 +165,8 @@ rm (delets2) inserts2 <- data.frame(matrix(ncol = 2, nrow = 0)) colnames(inserts2)<- c("Nucleotide", "Sample") -for (i in samples) { - temp <- tp2[tp2$Sample==i,][,11] +for (i in samples_ecoli) { + temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,11] temp <- unlist(strsplit(temp,"")) temp<- temp[temp %in% c(letters, LETTERS)] temp<-cbind("Nucleotide"=temp, "Sample" = i) @@ -159,9 +175,11 @@ for (i in samples) { inserts2<-rbind(temp, inserts2) } ggplot(inserts2) + - ggtitle("Insertions") + + xlab("Нуклеотид")+ + ylab("Количество вставок")+ + ggtitle("Состав вставок") + geom_bar(aes(x = Nucleotide), width = 0.4, fill = "blue", alpha = 0.5, col = "black") + - facet_wrap(~Sample, nrow = 2) + + facet_wrap(~Sample, nrow = 3) + #ylim(0, 2000) + theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_new_plasmid/inserts.png", width = 2250, height = 1500, units = "px") @@ -171,8 +189,8 @@ rm (inserts2) insert_mean2 <- data.frame(matrix(ncol = 7, nrow = 0)) colnames(insert_mean2)<- c("Sample", "Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.") -for (i in samples) { - temp<-subset(tp2, Mean_length_del!=0 & Sample == i)[,7] +for (i in samples_ecoli) { + temp<-subset(tp2_plasmid, Mean_length_del!=0 & Sample == i)[,7] temp<-cbind("Sample"=i, t(summary(temp))) insert_mean2<-rbind(insert_mean2, temp) } @@ -182,12 +200,12 @@ insert_mean2[, 5]<-as.numeric(insert_mean2[, 5]) insert_mean2[, 7]<-as.numeric(insert_mean2[, 7]) ggplot(data.frame(insert_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) + geom_pointrange(aes(ymin = 0, ymax = Max.)) + - scale_y_continuous(name = "Length") + geom_point(aes(x = factor(Sample), y = 0), shape = 1) + geom_point(aes(x = factor(Sample), y = Max.), shape = 1) + - ggtitle("Insertion mean length") + - scale_x_discrete(name = "Sample") + - theme(plot.title = element_text(hjust = 0.5)) + xlab("Образец")+ + ylab("Количество нуклеотидов")+ + ggtitle("Статистика длины вставки") + + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_new_plasmid/iml.png", width = 2250, height = 1500, units = "px") rm (insert_mean2) @@ -195,8 +213,8 @@ rm (insert_mean2) delete_mean2 <- data.frame(matrix(ncol = 7, nrow = 0)) colnames(delete_mean2)<- c("Sample", "Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.") -for (i in samples) { - temp<-subset(tp2, Mean_length_del!=0 & Sample == i)[,6] +for (i in samples_ecoli) { + temp<-subset(tp2_plasmid, Mean_length_del!=0 & Sample == i)[,6] temp<-cbind("Sample"=i, t(summary(temp))) delete_mean2<-rbind(delete_mean2, temp) } @@ -208,29 +226,32 @@ ggplot(data.frame(delete_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) geom_pointrange(aes(ymin = 0, ymax = Max.)) + geom_point(aes(x = factor(Sample), y = 0), shape = 1) + geom_point(aes(x = factor(Sample), y = Max.), shape = 1) + - ggtitle("Deletion mean length") + - scale_x_discrete(name = "Sample") + - scale_y_continuous(name = "Length") + - theme(plot.title = element_text(hjust = 0.5)) + xlab("Образец")+ + ylab("Количество нуклеотидов")+ + ggtitle("Статистика длины делеции") + + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_new_plasmid/dml.png", width = 2250, height = 1500, units = "px") rm(delete_mean2) #Number of mismatches per sample -ggplot(tp2, aes(x=Sample, y=Count_SNPs, group = 1)) + +ggplot(tp2_plasmid, aes(x=Sample, y=Count_SNPs, group = 1)) + stat_summary(fun = sum, geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + - ggtitle("Mismatches per sample") + + xlab("Образец")+ + ylab("Количество замен")+ + ggtitle("Количество однонуклеотидных замен") + #geom_vline(xintercept = 50, linetype="dotted") + - theme(plot.title = element_text(hjust = 0.5)) + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_new_plasmid/mps_mean.png", width = 2250, height = 1500, units = "px") -ggplot(tp2, aes(x=Sample, y=Count_SNPs, group = 1)) + - stat_summary(fun = mean, geom="point") + - stat_summary(fun = mean, geom="line") + - ggtitle("Mismatches per sample") + +ggplot(tp2_plasmid, aes(x=Sample, y=Count_SNPs, group = 1)) + + stat_summary(fun = mean, geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + + xlab("Образец")+ + ylab("Среднее число замен")+ + ggtitle("Количество однонуклеотидных замен на одно прочтение") + #geom_vline(xintercept = 50, linetype="dotted") + - theme(plot.title = element_text(hjust = 0.5)) + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_new_plasmid/mps.png", width = 2250, height = 1500, units = "px") #Number of deletions per cycle of sequence @@ -239,8 +260,8 @@ numbers_only <- function(x) !grepl("\\D", x) dels2 <- data.frame(matrix(nrow = 0, ncol = 2)) colnames(dels2)<-c("Num", "Sample") -for (i in samples) { - temp <- tp2[tp2$Sample==i,][,10] +for (i in samples_ecoli) { + temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,10] temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) temp <- unlist(strsplit(temp,",")) temp<- temp[numbers_only(temp)] @@ -253,9 +274,11 @@ ggplot(dels2, aes(x = Num)) + #geom_density(stat = "count", alpha = 0.5) + stat_count(geom="line", position="identity") + stat_count(geom="point", position="identity", size = 1) + - facet_wrap(~Sample, nrow = 2) + + facet_wrap(~Sample, nrow = 3) + #ylim(0, 1500) + - ggtitle ("Deletions per cycle") + + xlab("Цикл прочтения, №")+ + ylab("Количество делеций")+ + ggtitle("Число делеций на цикл") + theme_bw()+ theme(plot.title = element_text(hjust = 0.5)) ggsave("../report/images_new_plasmid/dpc.png", width = 2250, height = 1500, units = "px") @@ -265,8 +288,8 @@ rm(dels2) ins2 <- data.frame(matrix(nrow = 0, ncol = 2)) colnames(ins2)<-c("Num", "Sample") -for (i in samples) { - temp <- tp2[tp2$Sample==i,][,11] +for (i in samples_ecoli) { + temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,11] temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) temp <- unlist(strsplit(temp,",")) temp<- temp[numbers_only(temp)] @@ -278,9 +301,11 @@ ins2$Num<-as.numeric(ins2$Num) ggplot(ins2, aes(x = Num)) + stat_count(geom="line", position="identity") + stat_count(geom="point", position="identity", size = 1) + - facet_wrap(~Sample, nrow = 2, scales = "free_y") + + facet_wrap(~Sample, nrow = 3) + #ylim(0, 1500) + - ggtitle ("Insertions per cycle") + + xlab("Цикл прочтения, №")+ + ylab("Количество вставок")+ + ggtitle("Число вставок на цикл") + scale_x_continuous(breaks=seq(0, 150, 25))+ theme_bw()+ theme(plot.title = element_text(hjust = 0.5)) @@ -291,8 +316,8 @@ rm (ins2) parse_delets2 <- data.frame(matrix(nrow = 0, ncol = 2)) colnames(parse_delets2)<-c("Num", "Sample") -for (i in samples) { - temp <- tp2[tp2$Sample==i,][,10] +for (i in samples_ecoli) { + temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,10] temp <- unlist(strsplit(temp,split="|",fixed = TRUE)) temp<-cbind("Num"=temp, "Sample" = i) temp<-data.frame(temp) @@ -300,9 +325,11 @@ for (i in samples) { } ggplot(parse_delets2, aes(x=Sample, group = 1)) + stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + - ggtitle("Deletions per sample") + + xlab("Образец")+ + ylab("Количество делеций")+ + ggtitle("Число делеций на образец") + #geom_vline(xintercept = 50, linetype="dotted") + - theme(plot.title = element_text(hjust = 0.5)) + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_new_plasmid/dps.png", width = 2250, height = 1500, units = "px") rm (parse_delets2) @@ -310,8 +337,8 @@ rm (parse_delets2) parse_inserts2 <- data.frame(matrix(nrow = 0, ncol = 2)) colnames(parse_inserts2)<-c("Num", "Sample") -for (i in samples) { - temp <- tp2[tp2$Sample==i,][,11] +for (i in samples_ecoli) { + temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,11] temp <- unlist(strsplit(temp,split="|",fixed = TRUE)) temp<-cbind("Num"=temp, "Sample" = i) temp<-data.frame(temp) @@ -319,9 +346,11 @@ for (i in samples) { } ggplot(parse_inserts2, aes(x=Sample, group = 1)) + stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) + - ggtitle("Insertions per sample") + + xlab("Образец")+ + ylab("Количество вставок")+ + ggtitle("Число вставок на образец") + #geom_vline(xintercept = 50, linetype="dotted") + - theme(plot.title = element_text(hjust = 0.5)) + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) ggsave("../report/images_new_plasmid/ips.png", width = 2250, height = 1500, units = "px") rm (parse_inserts2, temp) @@ -329,8 +358,8 @@ rm (parse_inserts2, temp) insert_motifs2 <- data.frame(matrix(nrow = 0, ncol = 2)) colnames(insert_motifs2)<-c("Num", "Sample") -for (i in samples) { - temp <- tp2[tp2$Sample==i,][,11] +for (i in samples_ecoli) { + temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,11] temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) temp <- unlist(strsplit(temp,",")) temp<- temp[!numbers_only(temp)] @@ -341,17 +370,19 @@ for (i in samples) { ggplot(insert_motifs2, aes(x = Num)) + stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) + facet_wrap(~Sample, nrow=4, scales = "free_y") + - ggtitle("Insertion motifs per sample") + + xlab("Мотив")+ + ylab("Количество")+ + ggtitle("Представленность мотивов вставок в образцах") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) -ggsave("../report/images_new_plasmid/imps.png", width = 3250, height = 2500, units = "px") +ggsave("../report/images_new_plasmid/imps.png", width = 7550, height = 2500, units = "px") rm (insert_motifs2, temp) #Frequency of deletion motifs per sample delete_motifs2 <- data.frame(matrix(nrow = 0, ncol = 2)) colnames(delete_motifs2)<-c("Num", "Sample") -for (i in samples) { - temp <- tp2[tp2$Sample==i,][,10] +for (i in samples_ecoli) { + temp <- tp2_plasmid[tp2_plasmid$Sample==i,][,10] temp <- unlist(strsplit(temp, split="|",fixed = TRUE)) temp <- unlist(strsplit(temp,",")) temp<- temp[!numbers_only(temp)] @@ -362,7 +393,9 @@ for (i in samples) { ggplot(delete_motifs2, aes(x = Num)) + stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) + facet_wrap(~Sample, nrow=4, scales = "free_y") + - ggtitle("Deletions motifs per sample") + + xlab("Мотив")+ + ylab("Количество")+ + ggtitle("Представленность мотивов делеций в образцах") + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1)) -ggsave("../report/images_new_plasmid/dmps.png", width = 4550, height = 2500, units = "px") +ggsave("../report/images_new_plasmid/dmps.png", width = 7550, height = 2500, units = "px") rm (delete_motifs2, temp) \ No newline at end of file diff --git a/qr-code.png b/qr-code.png new file mode 100644 index 0000000..71ad61a Binary files /dev/null and b/qr-code.png differ