
finally finished content. Added online sources appendix for screenshots

u · 3 years ago
Parent commit 564bca0a42

+ 1 - 1
latex/tex/Makefile

@@ -7,7 +7,7 @@ MAKEINDEX = makeindex
 #INCLUDES = kapitel/abkuerzungen.tex docinfo.tex preambel.tex titelblatt.tex literatur.bib bilder/*.pdf
 INCLUDES = kapitel/abkuerzungen.tex docinfo.tex preambel.tex titelblatt.tex literatur.bib bilder/*.pdf
 
-CHAPTERS = kapitel/k*.tex kapitel/anhang*.tex
+CHAPTERS = kapitel/k*.tex kapitel/a*.tex
 
 define latex-it
 $(eval FILE = $(firstword $^))
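The widened glob kapitel/a*.tex now also matches the appendix files added in this commit (a0.tex, a_online.tex), which the old kapitel/anhang*.tex pattern would have missed. A quick sanity check from latex/tex/, assuming the layout shown in this commit:

  # list everything the CHAPTERS patterns now expand to
  ls kapitel/k*.tex kapitel/a*.tex
  # should include kapitel/a0.tex, kapitel/a5_feasability.tex,
  # kapitel/a6_results.tex and kapitel/a_online.tex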

+ 2 - 0
latex/tex/kapitel/a0.tex

@@ -0,0 +1,2 @@
+\vfill{}
+{\tiny :wq}

+ 29 - 26
latex/tex/kapitel/a5_feasability.tex

@@ -1,7 +1,8 @@
-\label{a5:cpu}
+\chapter{Test Environment and Server Specification}
+\label{a5:cpu}
 \textbf{CPU specification. Due to redundancy, the information is limited to the last core, beginning at:} processor : 7\\
 \noindent
 \begin{lstlisting}[language=bash]
 	cat /proc/cpuinfo 
 \end{lstlisting}
 processor : 0\\
@@ -35,29 +36,31 @@ cache\_alignment	: 64\\
 address sizes	: 36 bits physical, 48 bits virtual\\
 power management:\\
 
-\label{a5:pkg}
 \textbf{manually installed packages:}
-autoconf\\
-automake\\
-bzip2\\
-cmake\\
-gcc\\
-git\\
-htop\\
-libbz2-dev\\
-libcurl4-gnutls-dev\\
-libhts-dev\\
-libhtscodecs2\\
-liblzma-dev\\
-libncurses5-dev\\
-libomp-dev\\
-libssl-dev\\
-zlib1g-dev\\
-openssh-client\\
-perl\\
-rsync\\
-screen\\
-sudo\\
-ufw\\
-vim\\
-wget\\
+\label{a5:pkg}
+\begin{itemize}
+	\item autoconf
+	\item automake
+	\item bzip2
+	\item cmake
+	\item gcc
+	\item git
+	\item htop
+	\item libbz2-dev
+	\item libcurl4-gnutls-dev
+	\item libhts-dev
+	\item libhtscodecs2
+	\item liblzma-dev
+	\item libncurses5-dev
+	\item libomp-dev
+	\item libssl-dev
+	\item zlib1g-dev
+	\item openssh-client
+	\item perl
+	\item rsync
+	\item screen
+	\item sudo
+	\item ufw
+	\item vim
+	\item wget
+\end{itemize}
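The appendix does not say how this package list was collected; on a Debian-based system, which these package names suggest, a comparable list can be reproduced with stock apt tooling (a sketch, not necessarily the command used for the thesis):

  # packages explicitly installed by hand, i.e. not pulled in as dependencies
  apt-mark showmanual | sort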

+ 3 - 3
latex/tex/kapitel/a6_results.tex

@@ -1,7 +1,6 @@
-\chapter{Erster Anhang: Lange Tabelle}
+\chapter{Raw Test Results Structured in Tables}
 \sffamily
 \begin{footnotesize}
-	\label{a6:compr-time}
   \begin{longtable}[c]{ p{.2\textwidth} p{.2\textwidth} p{.2\textwidth} p{.2\textwidth}}
    \caption[Compression duration in milliseconds]                       % Caption for the list of tables
        {\textbf{Compression duration of various tools, measured in milliseconds}} % Caption for the table itself
@@ -39,13 +38,13 @@
 			File 2.6 & 51315& 15142& 24358\\
 			File 2.7 & 2065& 16379& 23484\\
     \bottomrule
+		\label{a6:compr-time}
   \end{longtable}
 \end{footnotesize}
 \rmfamily
 
 \sffamily
 \begin{footnotesize}
-	\label{a6:compr-size}
   \begin{longtable}[c]{ p{.2\textwidth} p{.2\textwidth} p{.2\textwidth} p{.2\textwidth} p{.2\textwidth}}
    \caption[File sizes in bytes]                       % Caption for the list of tables
        {\textbf{File sizes for different formats in bytes}} % Caption for the table itself
@@ -84,6 +83,7 @@
 			File 2.6& 1123124224& 12265535& 88147227& 77826446\\
 			File 2.7& 1300825946& 12450651& 75860986& 60239362\\
     \bottomrule
+		\label{a6:compr-size}
   \end{longtable}
 \end{footnotesize}
 \rmfamily

+ 1 - 0
latex/tex/kapitel/a_online.tex

@@ -0,0 +1 @@
+\chapter{Visual Persistence of Used Online Sources}

+ 3 - 1
latex/tex/kapitel/k5_feasability.tex

@@ -177,13 +177,15 @@ Since there are multiple open \ac{FTP} servers which distribute a variety of fil
 were chosen \cite{ftp-ensembl}. This sample includes 20 chromosomes; judging by the filenames, each file contains exactly one chromosome. After retrieving and unpacking the files, write privileges on them were withdrawn, so no tool could alter any file contents without sufficient permission.
 Finding a second, bigger set turned out to be more complicated. \acs{FTP} offers no fast, reliable way to sort files by size, regardless of their position in the directory tree. Since the available servers \cite{ftp-ensembl, ftp-ncbi, ftp-igsr} offer several thousand files, stored in varying, deep directory structures, mapping file size, file type and file path takes too much time and too many resources for the scope of this work. This problem, combined with an easily triggered overflow in the samtools library, resulted in a set of several manually searched and tested \acs{FASTq} files. Compared to the first set, there is a noticeable lack of quantity, but the file sizes happen to be fortunately distributed. With pairs of files around 0.6, 1.1 and 1.2 gigabytes and one file of 1.3 gigabytes, effects of scaling sizes should be clearly visible.\\
  
-% todo make sure this needs to stay.
+\mycomment{
+%make sure this needs to stay.
 \noindent The following tools and parameters were used in this process:
 \begin{lstlisting}[language=bash]
   \$ wget http://ftp.ensembl.org/pub/release-107/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.{2,3,4,5,6,7,8,9,10}.fa.gz
   \$ gzip -d ./*
   \$ chmod -w ./*
 \end{lstlisting}
+}
 
 The chosen tools are able to handle the \acs{FASTA} format. However, Samtools must convert \acs{FASTA} files into its \acs{SAM} format before a file can be compressed. The compression first leads to an output in \acs{BAM} format; from there it can be compressed further into a \acs{CRAM} file. For \acs{CRAM} compression, the time needed for each step, from conversion through both compressions, is summed up and displayed as one value. For the compression time into the \acs{BAM} format, only the conversion and the single compression time are summed up. The conversion from \acs{FASTA} to \acs{SAM} is not displayed in the results, since it is not a compression process and therefore has no value for this work.\\
 Even though \acs{SAM} files are not compressed, there can be a small but noticeable difference in size between the files in each format. Since \acs{FASTA} should store less information, by leaving out quality scores, this observation was counterintuitive. Comparing the first few lines showed two things: the header line was altered and newlines were removed. The alteration of the header line would account for only a few extra bytes. To verify that no information was lost while converting, both files were temporarily stripped of metadata and formatting, so that the raw data of both files could be compared. Using \texttt{diff} showed no differences between the stored characters in each file.\\
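The exact Samtools invocations behind the conversion chain described above are not part of this hunk. As a point of reference only, a minimal sketch of the SAM to BAM to CRAM steps with stock samtools subcommands could look like this (file names are placeholders; the options actually used in the tests may differ):

  # SAM -> BAM: first compression step, timed on its own
  time samtools view -b input.sam -o output.bam
  # BAM -> CRAM: further compression; CRAM encoding consults the reference sequence
  time samtools view -C -T reference.fa output.bam -o output.cram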

+ 50 - 10
latex/tex/kapitel/k6_results.tex

@@ -217,7 +217,7 @@ In both tables \ref{k6:recal-time} and \ref{k6:recal-size} the already identifie
 So far, this work went over formats for storing genomes, methods to compress files in the mentioned formats, and tests in which implementations of the named algorithms compressed several files, followed by an analysis of the results. The test results show that \acs{GeCo} provides a better compression ratio than Samtools but takes more time to run. So in this test run, implementations of arithmetic coding resulted in a better compression ratio than Samtools \acs{BAM}, with its mix of Huffman coding and \acs{LZ77}, or Samtools' custom compression format \acs{CRAM}. Comparing results in \autocite{survey} supports this statement. This study used \acs{FASTA}/Multi-FASTA files from 71MB to 166MB and found that \acs{GeCo} had a varying compression ratio, from 12.34 to 91.68 times smaller than the input reference, and also resulted in long runtimes of up to over 600 minutes \cite{survey}. Since this study focused on another goal than this work and therefore used different test variables and environments, the results cannot be compared directly. But what can be taken from this is that arithmetic coding, at least in \acs{GeCo}, is in need of a runtime improvement.\\
 The actual mathematical proof of such an improvement, the planning of an implementation and the development of a proof of concept will be a rewarding but time- and resource-consuming project. Dealing with those tasks would go beyond the scope of this work. But in order to widen the foundation for these tasks, the rest of this work consists of considerations and problem analysis that should be thought about and dealt with in order to develop an improvement.
 
-S.V. Petoukhov described his prepublished findings, which are under ongoing research, about the distribution of nucleotides \cite{pet21}. With the probability of one nucleotide, in a sequence of sufficient length, estimations about the direct neighbours of this nucleotide might be revealed. This can be illustrated in this formula \cite{pet12}:\\
+S.V. Petoukhov described his prepublished findings, which are under ongoing research, about the distribution of nucleotides \cite{pet21}. With the probability of one nucleotide, in a sequence of sufficient length, estimations about the direct neighbours of this nucleotide might be revealed. This can be illustrated in this formula \cite{pet21}:\\
 
 \texttt{\% C $\approx$ $\sum$\%CN $\approx$ $\sum$\%NC $\approx$ $\sum$\%CNN $\approx$ $\sum$\%NCN $\approx$ $\sum$\%NNC $\approx$ $\sum$\%CNNN $\approx$ $\sum$\%NCNN $\approx$ $\sum$\%NNCN $\approx$ $\sum$\%NNNC ...}
 
 Further, he described that there might be a similarity between nucleotides.
 \begin{figure}[H]
   \centering
   \includegraphics[width=15cm]{k6/pet-prob.png}
-  \caption{Probabilities for \texttt{A, C, G and T} in \texttt{Homo sapiens chromosome 1, GRCh38.p14 Primary Assembly} \cite{pet12, ftp-ncbi}.}
+  \caption{Probabilities for \texttt{A, C, G and T} in \texttt{Homo sapiens chromosome 1, GRCh38.p14 Primary Assembly} \cite{pet21, ftp-ncbi}.}
   \label{k6:pet-prob}
 \end{figure}
 
-The exemplaric probabilities he displayed are reprinted in \ref{k6:pet-prob}. Noteable are the similarities in the distirbution of \%A and \%G as well as in \%C and \%T. They align until the third digit after the decimal point. According to Petoukhov, this regularity is found in the genome of humans, some anmials, plants, bacteria and more \cite{pet12}.\\
+The exemplary probabilities he displayed are reprinted in figure \ref{k6:pet-prob}. Notable are the similarities in the distribution of \%A and \%G as well as of \%C and \%T. They align until the third digit after the decimal point. According to Petoukhov, this regularity is found in the genomes of humans, some animals, plants, bacteria and more \cite{pet21}.\\
 % begin optimization 
Considering this and the measured results, an improvement in the arithmetic coding process, and therefore in \acs{GeCo}'s efficiency, would be a good start to close the great gap in compression duration. Combined with a tool that is developed to today's standards, there is a possibility that even greater improvements could be achieved.\\
 % simple theoretical approach
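The probabilities reprinted in the figure above, and the values computed for File 1.10 further down, amount to simple symbol counts over the sequence lines of a FASTA file. A small awk sketch (assuming an uncompressed FASTA file; real chromosome files also contain N and soft-masked lowercase bases, which would appear as additional symbols):

  # count every symbol outside of header lines, print absolute and relative frequencies
  awk '!/^>/ { for (i = 1; i <= length($0); i++) c[substr($0, i, 1)]++ }
       END  { for (s in c) total += c[s];
              for (s in c) printf "%s %d %.6f\n", s, c[s], c[s] / total }' chromosome.fa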
@@ -316,13 +316,53 @@ The fact that there are obviously chains of repeating nucleotides in genomes. Fo
 
 \texttt{AACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTTAACCC} 
 
-Without determining probabilities, one can see that the amount of \texttt{C}s outnumbers \texttt{T}s and \texttt{A}s. With the whole 130\acs{MB}, the probability distribution will align more. The following values have been roundet down: 
+Without determining probabilities, one can see that the \texttt{C}s outnumber the \texttt{T}s and \texttt{A}s. With the whole sequence of 133258320 symbols (about 130\acs{MB}), the probability distribution will align more. The following values have been rounded down: \texttt{A $\approx$ 0.291723, C $\approx$ 0.207406, G $\approx$ 0.208009, T $\approx$ 0.2928609}. The pattern described by S. Petoukhov is recognizable. But cutting out a subsection of relevant size with an unequal distribution will have an impact on the probabilities of the whole sequence.
+If a greater sequence leads to a more equal distribution, this knowledge could be used to help determine the distributions of subsequences of a sequence with equally distributed probabilities.\\
+There are some rules that apply to any whole chromosome sequence as well as to subsequences, both referenced by \texttt{S}. With knowledge about the length \texttt{len(S)} and about the frequency and position of one symbol, e.g. \texttt{C} with its frequency written as \texttt{|C|}, rules about the enveloping sequence can be derived. The operations on symbols $\cdot$ for consecutive repetitions and $+$ for concatenation are used. With $x$ and $y$ as the number of nucleotides before the first and after the last \texttt{C}:
 
-\texttt{A $\approx$ 0.291723, C $\approx$ 0.207406, G $\approx$ 0.208009, T $\approx$ 0.2928609}
+\begin{itemize}
+	\item $\frac{len(S)}{x/y-1}\cdot (|C| -1)$ determines the number of $(x \cdot N) + C$ and $C + (y \cdot N)$ sequences $\in S$. 
+	\item The longest chain starting with \texttt{C} is $C + N \cdot (len(S) - x - 1)$.
+	\item The longest chain ending with \texttt{C} is $(len(S) - y -1) \cdot N + C$.
+	\item There are $(|C| - 1)$ occurrences of $(x + 1) \cdot N + C$ and an equal number of $C + N \cdot (y + 1)$.
+\end{itemize}
+Those statements might seem trivial to some, but they may help others to clarify the boundaries of Petoukhov's rules. Also, they represent the thought process behind this work's last section.\\
 
-Nevertheless, this 133258320 characters show the pattern described by S. Petoukhov.
-But by cutting out a subsection, of relevant size, with unequal distributions will have an impact on the probabilities of the whole sequence. 
-If a greater sequence would lead to a more equal distribution, this knowledge could be used to help determining distributions on subsequences of one with equaly distributed probabilities.
-% length cutting
+\mycomment{
 % todo erweitern um vergleiche zu survey work
-Besides multithreading, there are other methods that could impact improvement approaches. To get a bit more specific, ussage of entropy coding in reference free compression. This methods use structural properties, like repetitions or palindromes to apply a dictionary coding algorithm like \acs{LZ77} on the sequence. The parts that do not show any sign of forward or backward repetition get compressed using airhtmetic coding \cite{survey}. When this method is used, working with the probabilities of the whole genome is not purposeful. In the example subsequence out of \texttt{File 1.10}, no \texttt{G} is present. Compressing the subsequence with a additional interval of >0.2 for a symbol that would never get encoded, would be a waste of resources.\\
+Besides multithreading, there are other methods that could impact improvement approaches, like the usage of entropy coding in reference-free compression, in combination with other compression solutions. These methods use structural properties, like repetitions or palindromes, to apply a dictionary coding algorithm like \acs{LZ77} on the sequence. The parts that do not show any sign of forward or backward repetition get compressed using arithmetic coding \cite{survey}. When this method is used, working with the probabilities of the whole genome is not purposeful. In the example subsequence out of \texttt{File 1.10}, no \texttt{G} is present. Compressing the subsequence with an additional interval of >0.2 for a symbol that would never get encoded would be a waste of resources.\\
+}
+\mycomment{
+Summarizing relevant points to end this work in a final conclusion and the view in a possible future:
+- coding algorithms did not change drastically, in the last deccades 
+- improvements are archived by additions to existing algorithms and combining multiple algorithms for specific tasks
+- tests and comparings shown that arithmetic coding lacks in efficiency
+
+possible future events:
+best case
+- improvement through exact determination of whole porb distribution
+- petoukov is right
+=> improvements of probability determination for universal species whole chromosom sequences 
+=> possible further goal: optimization of prob determination for chromosome sections
+
+bad case
+- exact determination of all probabilities are not feasible
+ -> using A$\approx$G$\approx$0.28 and T=C=0.22 to estimate the probability and gather additional information to aproximate the real distibution
+- petoukov was wrong about universality of his rules
+ -> this still might work for a variety of genomes: all human chromosomes, mice, plants...
+}
+Before coming to a final conclusion, a quick summary of the important points:
+\begin{itemize}
+	\item coding algorithms did not change drastically in the last decades
+	\item improvements are achieved by extending existing algorithms and by combining multiple algorithms for specific tasks
+	\item tests and comparisons have shown that arithmetic coding lacks efficiency
+\end{itemize}
+The goal for this new optimization approach is clearly defined. Also, a possible test environment and measurement techniques that indicate success have been tested, in this work as well as in cited works \cite{survey}. Considering how other improvements were implemented in the past shows that the way this approach would work is feasible \cite{moffat_arith}. This, combined with the last point, leads to the assumption that there is a realistic chance to optimize entropy coding, specifically the arithmetic coding algorithm.\\
+This assumption is consolidated by viewing best- and worst-case scenarios that could result from further research. Two variables are taken into this thought process: one is the success of the optimization approach, the other whether Petoukhov's findings develop favorably:
+In the best case, optimization through exact determination of the whole probability distribution is possible and Petoukhov's findings prove that his rules are universal for the genomes of living organisms. This would result in faster compression with entropy coding. Depending on the dimension of the improvement, either a tool implementing entropy coding only or a hybrid tool with improved efficiency in its entropy coding algorithms would set the new \texttt{state of the art}.\\
+In a worst-case scenario, the exact determination of probability distributions would not be possible. This would mean more research should be done on approximating probability distributions, and additionally on how the use of $A\approx G \approx 0.2914$ and $C\approx T\approx 0.2086$ could provide efficiency improvements in reference-free compression of whole chromosomes, as well as general improvements in the compression of a reference genome in reference-based compression solutions \cite{survey}.\\
+Additionally, Petoukhov could be wrong about the universality of the defined rules. Considering the exemplary probability calculation for \texttt{File 1.10}, the concern that his rules do not apply to any genome at all, or that he simply miscalculated, is out of the way. Being wrong about universality would only limit the range of the impact an improvement would create. The combination of which genomes follow Petoukhov's rules and a list of tools that specialize in the compression of those would set the new goal for an optimization approach.\\
+
+%From this perspective, how favorable research turns out does not determine if there will be an impact but just how far it will reach.
+So, how favorably the research turns out does not determine whether there will be an impact, but only how far it will reach.
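As a worked illustration of how the File 1.10 probabilities quoted above would translate into arithmetic coding symbol intervals (a sketch using those values; cumulative sums rounded to six digits, so the last bound is treated as 1):

  \[
  A \mapsto [0,\, 0.291723), \quad
  C \mapsto [0.291723,\, 0.499129), \quad
  G \mapsto [0.499129,\, 0.707138), \quad
  T \mapsto [0.707138,\, 1)
  \]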
+

+ 2 - 0
latex/tex/thesis.tex

@@ -199,5 +199,7 @@
 \appendix
 \input{kapitel/a5_feasability}
 \input{kapitel/a6_results}
+\input{kapitel/a_online}
+%\input{kapitel/a0}
 
 \end{document}