diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..26d3352
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/.idea/diy_protocols.iml b/.idea/diy_protocols.iml
new file mode 100644
index 0000000..d0876a7
--- /dev/null
+++ b/.idea/diy_protocols.iml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..a971a2c
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..e902a04
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/bacterial_genome.R b/bacterial_genome.R
new file mode 100644
index 0000000..9039270
--- /dev/null
+++ b/bacterial_genome.R
@@ -0,0 +1,410 @@
+library(ggplot2)
+library(stringr)
+#####
+#This part of scripts produces the plots of alignment data in a directory "../report/images_assembly/" with format ".png". Only genome-aligned reverse reads were taken here.
+#For every sample was created a set of reads of equal capacity (min between all sample reads, estimated manually).
+#####
+
+#Uploading data in R
+
+temp<-read.csv("custom_1_mapq_table.txt", sep=";", header=TRUE)
+temp<-subset(temp, Read==2)
+temp<-temp[sample(nrow(temp), 190000),]
+temp<-cbind(temp, "Sample"=1)
+table_set<-temp
+samples<-c(2, 4:7, 9)
+for (i in samples) {
+ filename<-paste("custom_", i, "_mapq_table.txt", sep='')
+ temp<-read.csv(filename, sep=";", header=TRUE)
+ temp<-subset(temp, Read==2)
+ temp<-temp[sample(nrow(temp), 190000),]
+ temp<-cbind(temp, "Sample"=i)
+ table_set<-rbind(table_set, temp)
+}
+rm(temp)
+table_set$Sample<-as.factor(table_set$Sample)
+
+#GC-content
+
+ggplot(data = table_set, aes(x = GC)) +
+ geom_density(stat = "density", fill = "pink", alpha = 0.5) +
+ facet_wrap(~Sample, nrow = 3) +
+ geom_vline(xintercept = 50, linetype="dotted") +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_assembly/gc.png", width = 2250, height = 1500, units = "px")
+
+#Mismatches per cycle of sequence
+
+df2<-data.frame("Cycles"=0, "Sample"=1)
+for (i in c(1, samples)) {
+ cyc <-subset(table_set, Sample == i)[,4]
+ cyc <- as.numeric(unlist(strsplit(cyc,",")))
+ df2<-rbind(df2, cbind("Cycles" = cyc, "Sample" =i))
+}
+df2<-df2[-1,]
+df2$Sample<-as.factor(df2$Sample)
+rm (cyc, i)
+
+ggplot(df2, aes(x = Cycles)) +
+ geom_density(stat = "count", fill = "blue", alpha = 0.5) +
+ #geom_line(stat = "count", fill = "blue", alpha = 0.5) +
+ facet_wrap(~Sample, nrow = 3) +
+ ggtitle("Mismatches per cycle") +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_assembly/mpc.png", width = 2250, height = 1500, units = "px")
+ggplot(df2, aes(x = Cycles)) +
+ geom_density(stat = "density", fill = "blue", alpha = 0.5) +
+ facet_wrap(~Sample, nrow = 3) +
+ ggtitle("Mismatches per cycle") +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_assembly/mpc_density.png", width = 2250, height = 1500, units = "px")
+rm (df2)
+
+#Mismatches depending on GC-content
+
+mgc2<-cbind(table_set[, c("Count_SNPs", "Sample")], "GC" = round(table_set$GC))
+
+ggplot(mgc2, aes(x=GC, y=Count_SNPs)) +
+ stat_summary(fun = sum, geom="line", linewidth = 0.5) +
+ facet_wrap(~Sample, nrow = 3) +
+ ggtitle("Mismatches/GC") +
+ geom_vline(xintercept = 50, linetype="dotted") +
+ theme_bw()+
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_assembly/mgc.png", width = 2250, height = 1500, units = "px")
+ggplot(mgc2, aes(x=GC, y=Count_SNPs)) +
+ stat_summary(fun = mean, geom="line", size = 0.5) +
+ facet_wrap(~Sample, nrow = 3) +
+ ggtitle("Mismatches/GC") +
+ geom_vline(xintercept = 50, linetype="dotted") +
+ theme_bw()+
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_assembly/mgc_mean.png", width = 2250, height = 1500, units = "px")
+
+rm (mgc2)
+
+#Frequency of different types of mismatches
+
+mtype2 <- apply(table_set[, c(12:23)][table_set$Sample==1,], 2, sum)
+mtype2 <- data.frame("Sum" = mtype2)
+mtype2$type<-as.factor(colnames(table_set[, c(12:23)]))
+mtype2<-cbind(mtype2, "Sample" = 1)
+
+for (i in samples) {
+ temp <- apply(table_set[, c(12:23)][table_set$Sample==i,], 2, sum)
+ temp <- data.frame("Sum" = temp)
+ temp$type<-as.factor(colnames(table_set[, c(12:23)]))
+ temp<-cbind(temp, "Sample" = i)
+ mtype2<-rbind(mtype2, temp)
+}
+rm(temp)
+
+ggplot(mtype2) +
+ geom_line(aes(x = type, y = Sum, group = Sample)) +
+ geom_point(aes(x = type, y = Sum)) +
+ #facet_wrap(~Sample, nrow = 3, scales = "free_y") +
+ facet_wrap(~Sample, nrow = 3, ) +
+ ggtitle("Mismatch type") +
+ theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
+ggsave("../report/images_assembly/mt.png", width = 2250, height = 1500, units = "px")
+
+mtype2_2 <- apply(table_set[table_set$Sample==1, c(12:23)], 2, mean)
+mtype2_2 <- data.frame("mean" = mtype2_2)
+mtype2_2$type<-as.factor(colnames(table_set[, c(12:23)]))
+mtype2_2<-cbind(mtype2_2, "Sample" = 1)
+for (i in samples) {
+ temp <- apply(table_set[table_set$Sample==i, c(12:23)], 2, mean)
+ temp <- data.frame("mean" = temp)
+ temp$type<-as.factor(colnames(table_set[, c(12:23)]))
+ temp<-cbind(temp, "Sample" = i)
+ mtype2_2<-rbind(mtype2_2, temp)
+}
+mtype2_2$Sample<-as.factor(mtype2_2$Sample)
+
+ggplot(mtype2_2) +
+ geom_line(aes(x = type, y = mean, group = Sample)) +
+ geom_point(aes(x = type, y = mean)) +
+ #facet_wrap(~Sample, nrow = 3, scales = "free_y") +
+ facet_wrap(~Sample, nrow = 3) +
+ ggtitle("Mismatch type") +
+ theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
+ggsave("../report/images_assembly/mt_mean.png", width = 2250, height = 1500, units = "px")
+
+rm (mtype2, mtype2_2)
+
+#Number of deletions per sample
+
+delets2 <- table_set[table_set$Sample==1,][,10]
+delets2 <- unlist(strsplit(delets2,""))
+delets2<- delets2[delets2 %in% c(letters, LETTERS)]
+delets2<-cbind("Nucleotide"=delets2, "Sample" = 1)
+delets2<-data.frame(delets2)
+delets2$Nucleotide <- as.factor(delets2$Nucleotide)
+for (i in samples) {
+ temp <- table_set[table_set$Sample==i,][,10]
+ temp <- unlist(strsplit(temp,""))
+ temp<- temp[temp %in% c(letters, LETTERS)]
+ temp<-cbind("Nucleotide"=temp, "Sample" = i)
+ temp<-data.frame(temp)
+ temp$Nucleotide <- as.factor(temp$Nucleotide)
+ delets2<-rbind(temp, delets2)
+}
+delets2$Sample<-as.factor(delets2$Sample)
+rm (temp)
+
+ggplot(delets2) +
+ geom_bar(aes(x = Nucleotide), width = 0.25, fill = "orange") +
+ facet_wrap(~Sample, nrow = 3, scales = "free_y") +
+ #ylim(0, 1500) +
+ ggtitle ("Deletions") +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_assembly/delets.png", width = 2250, height = 1500, units = "px")
+
+rm (delets2)
+
+#Number of insertions per sample
+
+inserts2 <- table_set[table_set$Sample==1,][,11]
+inserts2 <- unlist(strsplit(inserts2,""))
+inserts2<- inserts2[inserts2 %in% c(letters, LETTERS)]
+inserts2<-cbind("Nucleotide"=inserts2, "Sample" = 1)
+inserts2<-data.frame(inserts2)
+inserts2$Nucleotide <- as.factor(inserts2$Nucleotide)
+for (i in samples) {
+ temp <- table_set[table_set$Sample==i,][,11]
+ temp <- unlist(strsplit(temp,""))
+ temp<- temp[temp %in% c(letters, LETTERS)]
+ temp<-cbind("Nucleotide"=temp, "Sample" = i)
+ temp<-data.frame(temp)
+ temp$Nucleotide <- as.factor(temp$Nucleotide)
+ inserts2<-rbind(temp, inserts2)
+}
+inserts2$Sample<-as.factor(inserts2$Sample)
+
+ggplot(inserts2) +
+ ggtitle("Insertions") +
+ geom_bar(aes(x = Nucleotide), width = 0.25, fill = "orange") +
+ facet_wrap(~Sample, nrow = 2, scales = "free_y") +
+ #ylim(0, 2000) +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_assembly/inserts.png", width = 2250, height = 1500, units = "px")
+
+rm (inserts2)
+
+#Mean, min and max values of insertions per sample
+
+insert_mean2<-subset(table_set, Sample == 1)[,7]
+insert_mean2<-cbind("Sample"=1, t(summary(insert_mean2)))
+for (i in samples) {
+ temp<-subset(table_set, Mean_length_insert!=0 & Sample == i)[,7]
+ temp<-cbind("Sample"=i, t(summary(temp)))
+ insert_mean2<-rbind(insert_mean2, temp)
+}
+
+ggplot(data.frame(insert_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) +
+ geom_pointrange(aes(ymin = 0, ymax = Max.)) +
+ #scale_y_continuous(breaks = c(0:12), limits = c(0, 13)) +
+ geom_point(aes(x = factor(Sample), y = 0), shape = 1) +
+ geom_point(aes(x = factor(Sample), y = Max.), shape = 1) +
+ ggtitle("Insert mean length") +
+ scale_x_discrete(name = "Sample") +
+ #scale_y_continuous(name = "Length", breaks = seq(0, 18, 1)) +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_assembly/iml.png", width = 2250, height = 1500, units = "px")
+
+rm (insert_mean2)
+
+#Mean, min and max values of deletions per sample
+
+delete_mean2<-subset(table_set, Sample == 1)[,6]
+delete_mean2<-cbind("Sample"=1, t(summary(delete_mean2)))
+for (i in samples) {
+ temp<-subset(table_set, Mean_length_del!=0 & Sample == i)[,6]
+ temp<-cbind("Sample"=i, t(summary(temp)))
+ delete_mean2<-rbind(delete_mean2, temp)
+}
+ggplot(data.frame(delete_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) +
+ geom_pointrange(aes(ymin = 0, ymax = Max.)) +
+ #scale_y_continuous(breaks = c(0:22), limits = c(0, 22)) +
+ geom_point(aes(x = factor(Sample), y = 0), shape = 1) +
+ geom_point(aes(x = factor(Sample), y = Max.), shape = 1) +
+ ggtitle("Delete mean length") +
+ scale_x_discrete(name = "Sample") +
+ #scale_y_continuous(name = "Length", breaks = seq(0, 12, 1)) +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_assembly/dml.png", width = 2250, height = 1500, units = "px")
+
+rm (delete_mean2)
+
+#Number of mismatches per sample
+
+ggplot(table_set, aes(x=Sample, y=Count_SNPs, group = 1)) +
+ stat_summary(fun = sum, geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
+ ggtitle("Mismatches per sample") +
+ #geom_vline(xintercept = 50, linetype="dotted") +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_assembly/mps_mean.png", width = 2250, height = 1500, units = "px")
+
+
+ggplot(table_set, aes(x=Sample, y=Count_SNPs, group = 1)) +
+ stat_summary(fun = mean, geom="point") +
+ stat_summary(fun = mean, geom="line") +
+ ggtitle("Mismatches per sample") +
+ #geom_vline(xintercept = 50, linetype="dotted") +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_assembly/mps.png", width = 2250, height = 1500, units = "px")
+
+#Number of deletions per cycle of sequence
+
+numbers_only <- function(x) !grepl("\\D", x)
+
+dels2 <- table_set[table_set$Sample==1,][,10]
+dels2 <- unlist(strsplit(dels2, split="|",fixed = TRUE))
+dels2 <- unlist(strsplit(dels2,","))
+dels2<- dels2[numbers_only(dels2)]
+dels2<-cbind("Num"=dels2, "Sample" = 1)
+dels2<-data.frame(dels2)
+for (i in samples) {
+ temp <- table_set[table_set$Sample==i,][,10]
+ temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
+ temp <- unlist(strsplit(temp,","))
+ temp<- temp[numbers_only(temp)]
+ temp<-cbind("Num"=temp, "Sample" = i)
+ temp<-data.frame(temp)
+ dels2<-rbind(temp, dels2)
+}
+dels2$Num<-as.numeric(dels2$Num)
+ggplot(dels2, aes(x = Num)) +
+ #geom_density(stat = "count", alpha = 0.5) +
+ stat_count(geom="line", position="identity") +
+ stat_count(geom="point", position="identity", size = 1) +
+ facet_wrap(~Sample, nrow = 3, scales = "free_y") +
+ #ylim(0, 1500) +
+ ggtitle ("Deletions per cycle") +
+ theme_bw()+
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_assembly/dpc.png", width = 2250, height = 1500, units = "px")
+
+rm (dels2, temp)
+
+#Number of insertions per cycle of sequence
+
+ins2 <- table_set[table_set$Sample==1,][,11]
+ins2 <- unlist(strsplit(ins2, split="|",fixed = TRUE))
+ins2 <- unlist(strsplit(ins2,","))
+ins2<- ins2[numbers_only(ins2)]
+ins2<-cbind("Num"=ins2, "Sample" = 1)
+ins2<-data.frame(ins2)
+for (i in samples) {
+ temp <- table_set[table_set$Sample==i,][,11]
+ temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
+ temp <- unlist(strsplit(temp,","))
+ temp<- temp[numbers_only(temp)]
+ temp<-cbind("Num"=temp, "Sample" = i)
+ temp<-data.frame(temp)
+ ins2<-rbind(temp, ins2)
+}
+ins2$Num<-as.numeric(ins2$Num)
+ggplot(ins2, aes(x = Num)) +
+ stat_count(geom="line", position="identity") +
+ stat_count(geom="point", position="identity", size = 1) +
+ facet_wrap(~Sample, nrow = 3, scales = "free_y") +
+ #ylim(0, 1500) +
+ ggtitle ("Insertions per cycle") +
+ scale_x_continuous(breaks=seq(0, 150, 25))+
+ theme_bw()+
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_assembly/ipc.png", width = 2250, height = 1500, units = "px")
+rm (ins2, temp)
+
+#Number of deletions per sample
+
+parse_delets2 <- table_set[table_set$Sample==1,][,10]
+parse_delets2 <- unlist(strsplit(parse_delets2, split="|",fixed = TRUE))
+parse_delets2<-cbind("Num"=parse_delets2, "Sample" = 1)
+parse_delets2<-data.frame(parse_delets2)
+for (i in samples) {
+ temp <- table_set[table_set$Sample==i,][,10]
+ temp <- unlist(strsplit(temp,split="|",fixed = TRUE))
+ temp<-cbind("Num"=temp, "Sample" = i)
+ temp<-data.frame(temp)
+ parse_delets2<-rbind(temp, parse_delets2)
+}
+ggplot(parse_delets2, aes(x=Sample, group = 1)) +
+ stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
+ ggtitle("Deletions per sample") +
+ #geom_vline(xintercept = 50, linetype="dotted") +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_assembly/dps.png", width = 2250, height = 1500, units = "px")
+rm (parse_delets2, temp)
+
+#Number of insertions per sample
+
+parse_inserts2 <- table_set[table_set$Sample==1,][,11]
+parse_inserts2 <- unlist(strsplit(parse_inserts2, split="|",fixed = TRUE))
+parse_inserts2<-cbind("Num"=parse_inserts2, "Sample" = 1)
+parse_inserts2<-data.frame(parse_inserts2)
+for (i in samples) {
+ temp <- table_set[table_set$Sample==i,][,11]
+ temp <- unlist(strsplit(temp,split="|",fixed = TRUE))
+ temp<-cbind("Num"=temp, "Sample" = i)
+ temp<-data.frame(temp)
+ parse_inserts2<-rbind(temp, parse_inserts2)
+}
+ggplot(parse_inserts2, aes(x=Sample, group = 1)) +
+ stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
+ ggtitle("Insertions per sample") +
+ #geom_vline(xintercept = 50, linetype="dotted") +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_assembly/ips.png", width = 2250, height = 1500, units = "px")
+rm (parse_inserts2, temp)
+
+#Frequency of insertion motifs per sample
+
+insert_motifs2 <- table_set[table_set$Sample==1,][,11]
+insert_motifs2 <- unlist(strsplit(insert_motifs2, split="|",fixed = TRUE))
+insert_motifs2 <- unlist(strsplit(insert_motifs2,","))
+insert_motifs2 <- insert_motifs2[!numbers_only(insert_motifs2)]
+insert_motifs2 <- cbind("Num"=insert_motifs2, "Sample" = 1)
+insert_motifs2 <- data.frame(insert_motifs2)
+for (i in samples) {
+ temp <- table_set[table_set$Sample==i,][,11]
+ temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
+ temp <- unlist(strsplit(temp,","))
+ temp <- temp[!numbers_only(temp)]
+ temp <- cbind("Num"=temp, "Sample" = i)
+ temp <- data.frame(temp)
+ insert_motifs2 <- rbind(temp, insert_motifs2)
+}
+ggplot(insert_motifs2, aes(x = Num)) +
+ stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) +
+ facet_wrap(~Sample, nrow=4, scales = "free_y") +
+ ggtitle("Insertion motifs per sample") +
+ theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
+ggsave("../report/images_assembly/imps.png", width = 2250, height = 2500, units = "px")
+rm (temp, insert_motifs2)
+
+#Frequency of deletion motifs per sample
+
+delete_motifs2 <- table_set[table_set$Sample==1,][,10]
+delete_motifs2 <- unlist(strsplit(delete_motifs2, split="|",fixed = TRUE))
+delete_motifs2 <- unlist(strsplit(delete_motifs2,","))
+delete_motifs2 <- delete_motifs2[!numbers_only(delete_motifs2)]
+delete_motifs2 <- cbind("Num"=delete_motifs2, "Sample" = 1)
+delete_motifs2 <- data.frame(delete_motifs2)
+for (i in samples) {
+ temp <- table_set[table_set$Sample==i,][,10]
+ temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
+ temp <- unlist(strsplit(temp,","))
+ temp <- temp[!numbers_only(temp)]
+ temp <- cbind("Num"=temp, "Sample" = i)
+ temp <- data.frame(temp)
+ delete_motifs2 <- rbind(temp, delete_motifs2)
+}
+ggplot(delete_motifs2, aes(x = Num)) +
+ stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) +
+ facet_wrap(~Sample, nrow=7, scales = "free_y") +
+ ggtitle("Deletions motifs per sample") +
+ theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
+ggsave("../report/images_assembly/dmps.png", width = 2250, height = 3500, units = "px")
+rm (delete_motifs2, temp)
\ No newline at end of file
diff --git a/plasmid.R b/plasmid.R
new file mode 100644
index 0000000..e8b68e2
--- /dev/null
+++ b/plasmid.R
@@ -0,0 +1,403 @@
+library(ggplot2)
+library(stringr)
+#####
+#This part of scripts produces the plots of alignment data. Only plasmid-aligned reverse reads were taken here.
+#For every sample was created a set of reads of equal capacity (min between all sample reads).
+#####
+
+#Uploading data in R
+
+t1<-read.csv("../custom_1_90_mapq.txt", sep=";")
+t2<-read.csv("../custom_2_87_mapq.txt", sep=";")
+t4<-read.csv("../custom_4_96_mapq.txt", sep=";")
+t5<-read.csv("../custom_5_92_mapq.txt", sep=";")
+t6<-read.csv("../custom_6_108_mapq.txt", sep=";")
+t7<-read.csv("../custom_7_97_mapq.txt", sep=";")
+tpol<-read.csv("../custom_pol_93_mapq.txt", sep=";")
+
+tp<-cbind(t1, "Sample" =1)
+tp<-rbind(tp, cbind(t2, "Sample" =2))
+tp<-rbind(tp, cbind(t4, "Sample" =4))
+tp<-rbind(tp, cbind(t5, "Sample" =5))
+tp<-rbind(tp, cbind(t6, "Sample" =6))
+tp<-rbind(tp, cbind(t7, "Sample" =7))
+tp<-rbind(tp, cbind(tpol, "Sample" =9))
+
+samples<-c(2, 4:7, 9)
+
+sample2<-subset(tp, Read==2&Sample==1)
+tp2<-sample2[sample(nrow(sample2), 20000),]
+for (i in samples) {
+ sample2<-subset(tp, Read==2&Sample==i)
+ tp2<-rbind(tp2, sample2[sample(nrow(sample2), 20000),])
+}
+rm(sample2, t1, t2, t4, t5, t6, t7, tpol)
+
+#GC-content
+
+ggplot(data = tp2, aes(x = GC)) +
+ geom_density(stat = "density", fill = "pink", alpha = 0.5) +
+ facet_wrap(~Sample, nrow = 3) +
+ geom_vline(xintercept = 50, linetype="dotted") +
+ xlim(25, 75) +
+ ylim(0, 0.15) +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_read2/gc.png", width = 2250, height = 1500, units = "px")
+
+#Mismatches per cycle of sequence
+
+df2<-data.frame("Cycles"=0, "Sample"=1)
+for (i in c(1:2, 4:7, 9)) {
+ cyc <-subset(tp2, Sample == i)[,4]
+ cyc <- as.numeric(unlist(strsplit(cyc,",")))
+ df2<-rbind(df2, cbind("Cycles" = cyc, "Sample" =i))
+}
+df2<-df2[-1,]
+df2$Sample<-as.factor(df2$Sample)
+rm (cyc, i)
+ggplot(df2, aes(x = Cycles)) +
+ geom_density(stat = "count", fill = "blue", alpha = 0.5) +
+ #geom_line(stat = "count", fill = "blue", alpha = 0.5) +
+ facet_wrap(~Sample, nrow = 3) +
+ ggtitle("Mismatches per cycle") +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_read2/mpc.png", width = 2250, height = 1500, units = "px")
+ggplot(df2, aes(x = Cycles)) +
+ geom_density(stat = "density", fill = "blue", alpha = 0.5) +
+ facet_wrap(~Sample, nrow = 3) +
+ ggtitle("Mismatches per cycle") +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_read2/mpc_density.png", width = 2250, height = 1500, units = "px")
+rm (df2)
+
+#Mismatches depending on GC-content
+
+mgc2<-cbind(tp2[, c("Count_SNPs", "Sample")], "GC" = round(tp2$GC))
+ggplot(mgc2, aes(x=GC, y=Count_SNPs)) +
+ stat_summary(fun = sum, geom="line", linewidth = 0.5) +
+ facet_wrap(~Sample, nrow = 3) +
+ ggtitle("Mismatches/GC") +
+ geom_vline(xintercept = 50, linetype="dotted") +
+ theme_bw()+
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_read2/mgc.png", width = 2250, height = 1500, units = "px")
+ggplot(mgc2, aes(x=GC, y=Count_SNPs)) +
+ stat_summary(fun = mean, geom="line", size = 0.5) +
+ facet_wrap(~Sample, nrow = 3) +
+ ggtitle("Mismatches/GC") +
+ geom_vline(xintercept = 50, linetype="dotted") +
+ theme_bw()+
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_read2/mgc_mean.png", width = 2250, height = 1500, units = "px")
+rm (mgc2)
+
+#Frequency of different types of mismatches
+
+mtype2 <- apply(tp2[, c(12:23)][tp2$Sample==1,], 2, sum)
+mtype2 <- data.frame("Sum" = mtype2)
+mtype2$type<-as.factor(colnames(tp2[, c(12:23)]))
+mtype2<-cbind(mtype2, "Sample" = 1)
+for (i in samples) {
+ temp <- apply(tp2[, c(12:23)][tp2$Sample==i,], 2, sum)
+ temp <- data.frame("Sum" = temp)
+ temp$type<-as.factor(colnames(tp2[, c(12:23)]))
+ temp<-cbind(temp, "Sample" = i)
+ mtype2<-rbind(mtype2, temp)
+}
+ggplot(mtype2) +
+ geom_line(aes(x = type, y = Sum, group = Sample)) +
+ geom_point(aes(x = type, y = Sum)) +
+ #facet_wrap(~Sample, nrow = 3, scales = "free_y") +
+ facet_wrap(~Sample, nrow = 3, ) +
+ ggtitle("Mismatch type") +
+ theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
+ggsave("../report/images_read2/mt.png", width = 2250, height = 1500, units = "px")
+
+
+mtype2_2 <- apply(tp2[tp2$Sample==1, c(12:23)], 2, mean)
+mtype2_2 <- data.frame("mean" = mtype2_2)
+mtype2_2$type<-as.factor(colnames(tp2[, c(12:23)]))
+mtype2_2<-cbind(mtype2_2, "Sample" = 1)
+for (i in samples) {
+ temp <- apply(tp2[tp2$Sample==i, c(12:23)], 2, mean)
+ temp <- data.frame("mean" = temp)
+ temp$type<-as.factor(colnames(tp2[, c(12:23)]))
+ temp<-cbind(temp, "Sample" = i)
+ mtype2_2<-rbind(mtype2_2, temp)
+}
+ggplot(mtype2_2) +
+ geom_line(aes(x = type, y = mean, group = Sample)) +
+ geom_point(aes(x = type, y = mean)) +
+ #facet_wrap(~Sample, nrow = 3, scales = "free_y") +
+ facet_wrap(~Sample, nrow = 3) +
+ ggtitle("Mismatch type") +
+ theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
+ggsave("../report/images_read2/mt_mean.png", width = 2250, height = 1500, units = "px")
+rm(mtype2, mtype2_2)
+
+#Number of deletions per sample
+
+delets2 <- tp2[tp2$Sample==1,][,10]
+delets2 <- unlist(strsplit(delets2,""))
+delets2<- delets2[delets2 %in% c(letters, LETTERS)]
+delets2<-cbind("Nucleotide"=delets2, "Sample" = 1)
+delets2<-data.frame(delets2)
+delets2$Nucleotide <- as.factor(delets2$Nucleotide)
+for (i in samples) {
+ temp <- tp2[tp2$Sample==i,][,10]
+ temp <- unlist(strsplit(temp,""))
+ temp<- temp[temp %in% c(letters, LETTERS)]
+ temp<-cbind("Nucleotide"=temp, "Sample" = i)
+ temp<-data.frame(temp)
+ temp$Nucleotide <- as.factor(temp$Nucleotide)
+ delets2<-rbind(temp, delets2)
+}
+ggplot(delets2) +
+ geom_bar(aes(x = Nucleotide), width = 0.25, fill = "orange") +
+ facet_wrap(~Sample, nrow = 3, scales = "free_y") +
+ #ylim(0, 1500) +
+ ggtitle ("Deletes") +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_read2/delets.png", width = 2250, height = 1500, units = "px")
+rm (delets2)
+
+#Number of insertions per sample
+
+inserts2 <- tp2[tp2$Sample==1,][,11]
+inserts2 <- unlist(strsplit(inserts2,""))
+inserts2<- inserts2[inserts2 %in% c(letters, LETTERS)]
+inserts2<-cbind("Nucleotide"=inserts2, "Sample" = 1)
+inserts2<-data.frame(inserts2)
+inserts2$Nucleotide <- as.factor(inserts2$Nucleotide)
+for (i in samples) {
+ temp <- tp2[tp2$Sample==i,][,11]
+ temp <- unlist(strsplit(temp,""))
+ temp<- temp[temp %in% c(letters, LETTERS)]
+ temp<-cbind("Nucleotide"=temp, "Sample" = i)
+ temp<-data.frame(temp)
+ temp$Nucleotide <- as.factor(temp$Nucleotide)
+ inserts2<-rbind(temp, inserts2)
+}
+ggplot(inserts2) +
+ ggtitle("Inserts") +
+ geom_bar(aes(x = Nucleotide), width = 0.25, fill = "orange") +
+ facet_wrap(~Sample, nrow = 2, scales = "free_y") +
+ #ylim(0, 2000) +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_read2/inserts.png", width = 2250, height = 1500, units = "px")
+rm (inserts2)
+
+#Mean, min and max values of insertions per sample
+
+insert_mean2<-subset(tp2, Sample == 1)[,7]
+insert_mean2<-cbind("Sample"=1, t(summary(insert_mean2)))
+for (i in samples) {
+ temp<-subset(tp2, Mean_length_insert!=0 & Sample == i)[,7]
+ temp<-cbind("Sample"=i, t(summary(temp)))
+ insert_mean2<-rbind(insert_mean2, temp)
+}
+ggplot(data.frame(insert_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) +
+ geom_pointrange(aes(ymin = 0, ymax = Max.)) +
+ scale_y_continuous(breaks = c(0:12), limits = c(0, 13)) +
+ geom_point(aes(x = factor(Sample), y = 0), shape = 1) +
+ geom_point(aes(x = factor(Sample), y = Max.), shape = 1) +
+ ggtitle("Insert mean length") +
+ scale_x_discrete(name = "Sample") +
+ scale_y_continuous(name = "Length", breaks = seq(0, 18, 1)) +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_read2/iml.png", width = 2250, height = 1500, units = "px")
+rm (insert_mean2)
+
+#Mean, min and max values of deletions per sample
+
+delete_mean2<-subset(tp2, Sample == 1)[,6]
+delete_mean2<-cbind("Sample"=1, t(summary(delete_mean2)))
+for (i in samples) {
+ temp<-subset(tp2, Mean_length_del!=0 & Sample == i)[,6]
+ temp<-cbind("Sample"=i, t(summary(temp)))
+ delete_mean2<-rbind(delete_mean2, temp)
+}
+ggplot(data.frame(delete_mean2), aes(x = factor(Sample), y = Mean), size = 0.5) +
+ geom_pointrange(aes(ymin = 0, ymax = Max.)) +
+ scale_y_continuous(breaks = c(0:22), limits = c(0, 22)) +
+ geom_point(aes(x = factor(Sample), y = 0), shape = 1) +
+ geom_point(aes(x = factor(Sample), y = Max.), shape = 1) +
+ ggtitle("Delete mean length") +
+ scale_x_discrete(name = "Sample") +
+ scale_y_continuous(name = "Length", breaks = seq(0, 12, 1)) +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_read2/dml.png", width = 2250, height = 1500, units = "px")
+rm(delete_mean2)
+
+#Number of mismatches per sample
+
+ggplot(tp2, aes(x=Sample, y=Count_SNPs, group = 1)) +
+ stat_summary(fun = mean, geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
+ ggtitle("Mismatches per sample") +
+ #geom_vline(xintercept = 50, linetype="dotted") +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_read2/mps_mean.png", width = 2250, height = 1500, units = "px")
+
+
+ggplot(tp2, aes(x=Sample, y=Count_SNPs, group = 1)) +
+ stat_summary(fun = sum, geom="point") +
+ stat_summary(fun = sum, geom="line") +
+ ggtitle("Mismatches per sample") +
+ #geom_vline(xintercept = 50, linetype="dotted") +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_read2/mps.png", width = 2250, height = 1500, units = "px")
+
+#Number of deletions per cycle of sequence
+
+numbers_only <- function(x) !grepl("\\D", x)
+
+dels2 <- tp2[tp2$Sample==1,][,10]
+dels2 <- unlist(strsplit(dels2, split="|",fixed = TRUE))
+dels2 <- unlist(strsplit(dels2,","))
+dels2<- dels2[numbers_only(dels2)]
+dels2<-cbind("Num"=dels2, "Sample" = 1)
+dels2<-data.frame(dels2)
+for (i in samples) {
+ temp <- tp2[tp2$Sample==i,][,10]
+ temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
+ temp <- unlist(strsplit(temp,","))
+ temp<- temp[numbers_only(temp)]
+ temp<-cbind("Num"=temp, "Sample" = i)
+ temp<-data.frame(temp)
+ dels2<-rbind(temp, dels2)
+}
+dels2$Num<-as.numeric(dels2$Num)
+ggplot(dels2, aes(x = Num)) +
+ #geom_density(stat = "count", alpha = 0.5) +
+ stat_count(geom="line", position="identity") +
+ stat_count(geom="point", position="identity", size = 1) +
+ facet_wrap(~Sample, nrow = 3, scales = "free_y") +
+ #ylim(0, 1500) +
+ ggtitle ("Deletes per cycle") +
+ theme_bw()+
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_read2/dpc.png", width = 2250, height = 1500, units = "px")
+rm(dels2)
+
+#Number of insertions per cycle of sequence
+
+ins2 <- tp2[tp2$Sample==1,][,11]
+ins2 <- unlist(strsplit(ins2, split="|",fixed = TRUE))
+ins2 <- unlist(strsplit(ins2,","))
+ins2<- ins2[numbers_only(ins2)]
+ins2<-cbind("Num"=ins2, "Sample" = 1)
+ins2<-data.frame(ins2)
+for (i in samples) {
+ temp <- tp2[tp2$Sample==i,][,11]
+ temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
+ temp <- unlist(strsplit(temp,","))
+ temp<- temp[numbers_only(temp)]
+ temp<-cbind("Num"=temp, "Sample" = i)
+ temp<-data.frame(temp)
+ ins2<-rbind(temp, ins2)
+}
+ins2$Num<-as.numeric(ins2$Num)
+ggplot(ins2, aes(x = Num)) +
+ stat_count(geom="line", position="identity") +
+ stat_count(geom="point", position="identity", size = 1) +
+ facet_wrap(~Sample, nrow = 3, scales = "free_y") +
+ #ylim(0, 1500) +
+ ggtitle ("Inserts per cycle") +
+ scale_x_continuous(breaks=seq(0, 150, 25))+
+ scale_y_continuous(drop=FALSE)
+ theme_bw()+
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_read2/ipc.png", width = 2250, height = 1500, units = "px")
+rm (ins2)
+
+#Number of deletions per sample
+
+parse_delets2 <- tp2[tp2$Sample==1,][,10]
+parse_delets2 <- unlist(strsplit(parse_delets2, split="|",fixed = TRUE))
+parse_delets2<-cbind("Num"=parse_delets2, "Sample" = 1)
+parse_delets2<-data.frame(parse_delets2)
+for (i in samples) {
+ temp <- tp2[tp2$Sample==i,][,10]
+ temp <- unlist(strsplit(temp,split="|",fixed = TRUE))
+ temp<-cbind("Num"=temp, "Sample" = i)
+ temp<-data.frame(temp)
+ parse_delets2<-rbind(temp, parse_delets2)
+}
+ggplot(parse_delets2, aes(x=Sample, group = 1)) +
+ stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
+ ggtitle("Deletes per sample") +
+ #geom_vline(xintercept = 50, linetype="dotted") +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_read2/dps.png", width = 2250, height = 1500, units = "px")
+rm (parse_delets2)
+
+#Number of insertions per sample
+
+parse_inserts2 <- tp2[tp2$Sample==1,][,11]
+parse_inserts2 <- unlist(strsplit(parse_inserts2, split="|",fixed = TRUE))
+parse_inserts2<-cbind("Num"=parse_inserts2, "Sample" = 1)
+parse_inserts2<-data.frame(parse_inserts2)
+for (i in samples) {
+ temp <- tp2[tp2$Sample==i,][,11]
+ temp <- unlist(strsplit(temp,split="|",fixed = TRUE))
+ temp<-cbind("Num"=temp, "Sample" = i)
+ temp<-data.frame(temp)
+ parse_inserts2<-rbind(temp, parse_inserts2)
+}
+ggplot(parse_inserts2, aes(x=Sample, group = 1)) +
+ stat_count(geom="bar", width = 0.7, fill = "white", color = "black", alpha = 0.7) +
+ ggtitle("Inserts per sample") +
+ #geom_vline(xintercept = 50, linetype="dotted") +
+ theme(plot.title = element_text(hjust = 0.5))
+ggsave("../report/images_read2/ips.png", width = 2250, height = 1500, units = "px")
+rm (parse_inserts2, temp)
+
+#Frequency of insertion motifs per sample
+
+insert_motifs2 <- tp2[tp2$Sample==1,][,11]
+insert_motifs2 <- unlist(strsplit(insert_motifs2, split="|",fixed = TRUE))
+insert_motifs2 <- unlist(strsplit(insert_motifs2,","))
+insert_motifs2<- insert_motifs2[!numbers_only(insert_motifs2)]
+insert_motifs2<-cbind("Num"=insert_motifs2, "Sample" = 1)
+insert_motifs2<-data.frame(insert_motifs2)
+for (i in samples) {
+ temp <- tp2[tp2$Sample==i,][,11]
+ temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
+ temp <- unlist(strsplit(temp,","))
+ temp<- temp[!numbers_only(temp)]
+ temp<-cbind("Num"=temp, "Sample" = i)
+ temp<-data.frame(temp)
+ insert_motifs2<-rbind(temp, insert_motifs2)
+}
+ggplot(insert_motifs2, aes(x = Num)) +
+ stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) +
+ facet_wrap(~Sample, nrow=4, scales = "free_y") +
+ ggtitle("Insert motifs per sample") +
+ theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
+ggsave("../report/images_read2/imps.png", width = 2250, height = 2500, units = "px")
+rm (insert_motifs2, temp)
+
+#Frequency of deletion motifs per sample
+
+delete_motifs2 <- tp2[tp2$Sample==1,][,10]
+delete_motifs2 <- unlist(strsplit(delete_motifs2, split="|",fixed = TRUE))
+delete_motifs2 <- unlist(strsplit(delete_motifs2,","))
+delete_motifs2<- delete_motifs2[!numbers_only(delete_motifs2)]
+delete_motifs2<-cbind("Num"=delete_motifs2, "Sample" = 1)
+delete_motifs2<-data.frame(delete_motifs2)
+for (i in samples) {
+ temp <- tp2[tp2$Sample==i,][,10]
+ temp <- unlist(strsplit(temp, split="|",fixed = TRUE))
+ temp <- unlist(strsplit(temp,","))
+ temp<- temp[!numbers_only(temp)]
+ temp<-cbind("Num"=temp, "Sample" = i)
+ temp<-data.frame(temp)
+ delete_motifs2<-rbind(temp, delete_motifs2)
+}
+ggplot(delete_motifs2, aes(x = Num)) +
+ stat_count(geom='bar', width = 0.7, fill = "white", color = "black", alpha = 0.7) +
+ facet_wrap(~Sample, nrow=4, scales = "free_y") +
+ ggtitle("Deletes motifs per sample") +
+ theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 90, hjust = 1))
+ggsave("../report/images_read2/dmps.png", width = 2250, height = 2500, units = "px")
+rm (delete_motifs2, temp)
\ No newline at end of file