@@ -1,5 +1,5 @@

\chapter{Results and Discussion}

-The tables \ref{a6:testsets-size} and \ref{a6:testsets-time} contain raw measurement values for the two goals, described in \ref{k5:goals}. The table \ref{a6:testsets-time} lists how long each compression procedure took, in milliseconds. \ref{a6:testsets-size} contains file sizes in bytes. In these tables, as well as in the other ones associated with tests in the scope of this work, the a name scheme is used, to improve readability. The filenames were replaced by \texttt{File} followed by two numbers seperated by a point. For the first test set, the number prefix \texttt{1.} was used, the second set is marked with a \texttt{2.}. For example, the fourth file of each test, in tables are named like this \texttt{File 1.4} and \texttt{File 2.4}. The name of the associated source file for the first set is:
+Tables \ref{a6:compr-size} and \ref{a6:compr-time} contain the raw measurement values for the two goals described in \ref{k5:goals}. Table \ref{a6:compr-time} lists how long each compression procedure took, in milliseconds; table \ref{a6:compr-size} contains the file sizes in bytes. In these tables, as well as in the other tables associated with the tests in the scope of this work, a naming scheme is used to improve readability. The filenames were replaced by \texttt{File} followed by two numbers separated by a point. For the first test set, the number prefix \texttt{1.} was used; the second set is marked with a \texttt{2.}. For example, the fourth file of each test set is named \texttt{File 1.4} or \texttt{File 2.4} in the tables. The name of the associated source file for the first set is:

\texttt{Homo\_sapiens.GRCh38.dna.chromosome.\textbf{4}.fa}

@@ -215,13 +215,13 @@ In both tables \ref{k6:recal-time} and \ref{k6:recal-size} the already identifie

\section{View on Possible Improvements}

So far, this work has gone over formats for storing genomes and methods to compress files in those formats, and through tests in which implementations of the named algorithms compressed several files, followed by an analysis of the results. The test results show that \acs{GeCo} provides a better compression ratio than Samtools but takes more time to run. In this test run, implementations of arithmetic coding therefore resulted in a better compression ratio than Samtools \acs{BAM}, with its mix of Huffman coding and \acs{LZ77}, or Samtools' custom compression format \acs{CRAM}. The results in \autocite{survey} support this statement. That study used \acs{FASTA}/Multi-FASTA files from 71 MB to 166 MB and found that \acs{GeCo} reached a compression ratio varying from 12.34 to 91.68 times smaller than the input reference, but also resulted in long runtimes of up to over 600 minutes \cite{survey}. Since that study focused on a different goal than this work and therefore used different test variables and environments, the results cannot be compared directly. What can be taken from this, however, is that arithmetic coding, at least as implemented in \acs{GeCo}, is in need of a runtime improvement.\\
-The actual mathematical proove of such an improvemnt and its implementation can not be covered because it would to beyond scope. But in order to set up a foundation for this task, the rest of this work will consist of considerations and problem analysis, which should be thought about and dealt with to develop a improvement.
+The actual mathematical proof of such an improvement, the planning of an implementation and the development of a proof of concept would be a rewarding but time- and resource-consuming project. Dealing with those tasks would go beyond the scope of this work. But in order to widen the foundation for them, the rest of this work consists of considerations and problem analysis that should be thought about and dealt with in order to develop an improvement.

S.V. Petoukhov described his findings about the distribution of nucleotides \cite{pet21}. In a sequence of sufficient length, the probability of one nucleotide reveals information about its direct neighbours. For example, from the probability of \texttt{C}, the probabilities for sets (n-plets) of any nucleotide \texttt{N} that include \texttt{C} can be determined without counting them \cite{pet21}.\\

\[
\%C \approx \sum\%CN \approx \sum\%NC \approx \sum\%CNN \approx \sum\%NCN \approx \sum\%NNC \approx \sum\%CNNN \approx \sum\%NCNN \approx \sum\%NNCN \approx \sum\%NNNC
\]

% begin optimization
-Considering this and the meassured results, an improvement in the arithmetic coding process and therefore in \acs{GeCo}s efficiency, would be a good start to equalize the great gap in the compression duration. Combined with a tool that is developed with todays standards, there is a possibility that even greater improvements could be archived.\\
+Considering this and the measured results, an improvement in the arithmetic coding process, and therefore in \acs{GeCo}'s efficiency, would be a good starting point to close the large gap in compression duration. Combined with a tool that is developed to today's standards, there is a possibility that even greater improvements could be achieved.\\

% simple theoretical approach

What would a theoretical improvement approach look like? As described in \ref{k4:arith}, entropy coding requires determining the probabilities of each symbol in the alphabet. The simplest way to do this is to parse the whole sequence from start to end, increasing a counter for each nucleotide that is parsed.
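
As an illustration, such a counting pass could look like the following Python sketch (a simplified stand-in, not code from any of the tested tools; the filename is a placeholder):

\begin{verbatim}
from collections import Counter

def nucleotide_probabilities(path):
    # One pass over the sequence: count every nucleotide.
    counts = Counter()
    with open(path) as handle:
        for line in handle:
            if line.startswith(">"):  # skip FASTA header lines
                continue
            counts.update(s for s in line.strip().upper() if s in "ACGT")
    total = sum(counts.values())
    # The relative frequency of a symbol serves as its probability.
    return {s: counts[s] / total for s in "ACGT"}

print(nucleotide_probabilities("chromosome.fa"))
\end{verbatim}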

With the new findings discovered by Petoukhov in consideration, the goal would be to create an entropy coding implementation that beats current implementations in the time needed to determine the probabilities. A possible approach would be to use the probability of one nucleotide to determine the probabilities of other nucleotides by a calculation, rather than by the process of counting each one.
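
As a first step towards such an implementation, the following hedged Python sketch checks Petoukhov's relation by brute force on a toy sequence; under the relation, the second counting pass over the pairs is exactly the work a calculation-based implementation could save (the sequence and function names are illustrative):

\begin{verbatim}
from collections import Counter

def check_relation(seq):
    # Brute-force comparison: %C versus the summed probabilities of
    # all di-nucleotides ending in C (%AC + %CC + %GC + %TC).
    mono = Counter(seq)
    pairs = Counter(seq[i:i + 2] for i in range(len(seq) - 1))
    p_c = mono["C"] / len(seq)
    sum_nc = sum(pairs[n + "C"] for n in "ACGT") / (len(seq) - 1)
    return p_c, sum_nc  # approximately equal for long sequences

print(check_relation("ACGGTCATTGCA" * 1000))
\end{verbatim}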
@@ -267,7 +267,7 @@ If there space for improvement in the parsing/counting process, what problems ne

\begin{itemize}
    \item reducing one process by adding additional code must be estimated and set into relation.
- \item for a tool that does not feature multithreading, how would multithreading affect the improvement reulst?
+ \item for a tool that does not feature multithreading, how would multithreading affect the improvement results? (see the sketch after this list)

\end{itemize}
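
Regarding the multithreading question above, the following hedged Python sketch (chunking strategy and worker count are arbitrary illustration choices, not part of any tested tool) shows how trivially the counting step itself parallelizes. Since a multithreaded baseline could shrink exactly the runtime share that a calculation-based shortcut targets, this has to be factored into any measurement of the improvement:

\begin{verbatim}
from collections import Counter
from concurrent.futures import ProcessPoolExecutor

def count_chunk(chunk):
    # Each worker counts the nucleotides of its own chunk.
    return Counter(s for s in chunk if s in "ACGT")

def parallel_counts(seq, workers=4):
    # Split the sequence into roughly equal chunks, one per worker.
    size = len(seq) // workers + 1
    chunks = [seq[i:i + size] for i in range(0, len(seq), size)]
    with ProcessPoolExecutor(max_workers=workers) as pool:
        partials = pool.map(count_chunk, chunks)
    return sum(partials, Counter())  # merge the partial counts

if __name__ == "__main__":
    print(parallel_counts("ACGTACGGTC" * 100000))
\end{verbatim}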

% todo petoukhov just said T = AT+GT+CT+TT = %NT and %T = %TN
@@ -284,6 +284,7 @@ The fact that there are obviously chains of repeating nucleotides in genomes. Fo

Without determining probabilities, one can see that the \texttt{A}s outnumber the \texttt{T}s and that neither \texttt{C} nor \texttt{G} is present. Over the whole 1.2 gigabytes, the distribution will even out more, but cutting out a subsection of relevant size with an unequal distribution will have an impact on the probabilities of the whole sequence. If a longer sequence leads to a more equal distribution, this knowledge could be used to help determine the distributions of subsequences of a sequence with equally distributed probabilities.
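
The following hedged Python sketch illustrates this effect on an artificial sequence (the sequence and the window size are made up for demonstration): a skewed subsection yields probabilities that deviate from those of the whole sequence, which is exactly the problem any subsequence-based shortcut has to deal with:

\begin{verbatim}
from collections import Counter

def window_vs_whole(seq, window):
    # Compare the distribution of a leading subsection with the
    # distribution of the whole sequence.
    part = Counter(seq[:window])
    whole = Counter(seq)
    return {s: (part[s] / window, whole[s] / len(seq)) for s in "ACGT"}

# Skewed start (only As and Ts), followed by an evenly mixed tail.
seq = "A" * 500 + "T" * 100 + "ACGT" * 1000
print(window_vs_whole(seq, 600))
\end{verbatim}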

% length cutting
+% todo: extend with comparisons to the survey work

% how is data interpreted
% why did the tools result in this, what can we learn
% improvements