diff --git a/TracingPaper.aux b/TracingPaper.aux index 089089d..5774299 100644 --- a/TracingPaper.aux +++ b/TracingPaper.aux @@ -29,51 +29,63 @@ \citation{Anderson2004} \citation{Roselli2000} \citation{Vogels1999} +\citation{Orosz2013} +\citation{Dabir2008} +\citation{Skopko2012} +\citation{Orosz2013} +\citation{PFRING} +\citation{Ellard2003} +\citation{Anderson2004} \citation{Leung2008} \@writefile{toc}{\contentsline {subsection}{\numberline {1.4}The Need for a New Study}{3}} \newlabel{The Need for a New Study}{{1.4}{3}} \@writefile{toc}{\contentsline {section}{\numberline {2}Methodology}{3}} \newlabel{Methodology}{{2}{3}} -\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Effects of System Setup on Tracing}{3}} -\newlabel{Effects of System Setup on Tracing}{{2.1}{3}} +\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}System Limitations}{3}} +\newlabel{System Limitations}{{2.1}{3}} \@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Main Challenges}{3}} \newlabel{Main Challenges}{{2.2}{3}} \@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Interpretation of Data}{3}} \newlabel{Interpretation of Data}{{2.3}{3}} -\@writefile{toc}{\contentsline {subsection}{\numberline {2.4}Selective Importance of Information}{3}} -\newlabel{Selective Importance of Information}{{2.4}{3}} -\@writefile{toc}{\contentsline {subsection}{\numberline {2.5}Arbitrary Distribution of Collected Information}{3}} -\newlabel{Arbitrary Distribution of Collected Information}{{2.5}{3}} +\citation{Ellard2003} +\@writefile{toc}{\contentsline {subsection}{\numberline {2.4}Scope of Interpretation}{4}} +\newlabel{Scope of Interpretation}{{2.4}{4}} \@writefile{toc}{\contentsline {section}{\numberline {3}Tracing System}{4}} \newlabel{Tracing System}{{3}{4}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Different Stages of Trace}{4}} -\newlabel{Different Stages of Trace}{{3.1}{4}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}About the 
Systems Being Traced}{4}} -\newlabel{About the Systems Being Traced}{{3.2}{4}} -\@writefile{toc}{\contentsline {section}{\numberline {4}Trace Analysis}{4}} -\newlabel{Trace Analysis}{{4}{4}} +\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Stages of Trace}{4}} +\newlabel{Stages of Trace}{{3.1}{4}} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.1}Capture}{4}} +\newlabel{Capture}{{3.1.1}{4}} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.2}Collection}{4}} +\newlabel{Collection}{{3.1.2}{4}} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.3}Dissection/Analysis}{4}} +\newlabel{Dissection/Analysis}{{3.1.3}{4}} \citation{MS-CIFS} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Request/Response Command Pairing}{5}} -\newlabel{Request/Response Command Pairing}{{4.1}{5}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.2}IAT Calculation for Request/Response Pairs}{5}} -\newlabel{IAT Calculation for Request/Response Pairs}{{4.2}{5}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.3}SMB}{5}} -\newlabel{SMB}{{4.3}{5}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.4}Other (e.g. HTML)}{5}} -\newlabel{Other (e.g. 
HTML)}{{4.4}{5}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.5}Process ID Tracking}{5}} -\newlabel{Process ID Tracking}{{4.5}{5}} -\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces \relax \fontsize {9}{11}\selectfont \abovedisplayskip 8.5\p@ plus3\p@ minus4\p@ \abovedisplayshortskip \z@ plus2\p@ \belowdisplayshortskip 4\p@ plus2\p@ minus2\p@ \def \leftmargin \leftmargini \topsep 4\p@ plus2\p@ minus2\p@ \parsep 2\p@ plus\p@ minus\p@ \itemsep \parsep {\leftmargin \leftmargini \topsep 4\p@ plus2\p@ minus2\p@ \parsep 2\p@ plus\p@ minus\p@ \itemsep \parsep }\belowdisplayskip \abovedisplayskip \itshape Rough Sketch of Communication}}{5}} -\newlabel{fig-communication}{{1}{5}} +\@writefile{toc}{\contentsline {section}{\numberline {4}Trace Analysis}{5}} +\newlabel{Trace Analysis}{{4}{5}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.1}SMB}{5}} +\newlabel{SMB}{{4.1}{5}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.2}ID Tracking}{5}} +\newlabel{ID Tracking}{{4.2}{5}} \citation{Anderson2004} \citation{Anderson2004} \citation{Anderson2004} -\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.5.1}event\_data Structure Tracking}{6}} -\newlabel{event_data Structure Tracking}{{4.5.1}{6}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.6}Run Patterns}{6}} -\newlabel{Run Patterns}{{4.6}{6}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.7}Locating Performance Bottlenecks}{6}} -\newlabel{Locating Performance Bottlenecks}{{4.7}{6}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.3}Run Patterns}{6}} +\newlabel{Run Patterns}{{4.3}{6}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.4}Locating Performance Bottlenecks}{6}} +\newlabel{Locating Performance Bottlenecks}{{4.4}{6}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.5}Other (e.g. HTML)}{6}} +\newlabel{Other (e.g. 
HTML)}{{4.5}{6}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.6}Process ID Tracking}{6}} +\newlabel{Process ID Tracking}{{4.6}{6}} +\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces \relax \fontsize {9}{11}\selectfont \abovedisplayskip 8.5\p@ plus3\p@ minus4\p@ \abovedisplayshortskip \z@ plus2\p@ \belowdisplayshortskip 4\p@ plus2\p@ minus2\p@ \def \leftmargin \leftmargini \topsep 4\p@ plus2\p@ minus2\p@ \parsep 2\p@ plus\p@ minus\p@ \itemsep \parsep {\leftmargin \leftmargini \topsep 4\p@ plus2\p@ minus2\p@ \parsep 2\p@ plus\p@ minus\p@ \itemsep \parsep }\belowdisplayskip \abovedisplayskip \itshape Rough Sketch of Communication}}{6}} +\newlabel{fig-communication}{{1}{6}} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.6.1}event\_data Structure Tracking}{6}} +\newlabel{event_data Structure Tracking}{{4.6.1}{6}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.7}Run Patterns}{6}} +\newlabel{Run Patterns}{{4.7}{6}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.8}Locating Performance Bottlenecks}{6}} +\newlabel{Locating Performance Bottlenecks}{{4.8}{6}} \@writefile{toc}{\contentsline {section}{\numberline {5}Intuition Confirm/Change}{6}} \newlabel{Intuition Confirm/Change}{{5}{6}} \@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Characterizations of Different Packet Types}{6}} @@ -116,3 +128,4 @@ \bibcite{Roselli2000}{12} \bibcite{Vogels1999}{13} \bibcite{Meyer2012}{14} +\bibcite{PFRING}{15} diff --git a/TracingPaper.log b/TracingPaper.log index 4ef7f30..2cc0e6e 100644 --- a/TracingPaper.log +++ b/TracingPaper.log @@ -1,4 +1,4 @@ -This is pdfTeX, Version 3.1415926-2.5-1.40.14 (MiKTeX 2.9 64-bit) (preloaded format=pdflatex 2014.12.20) 20 DEC 2014 16:52 +This is pdfTeX, Version 3.1415926-2.5-1.40.14 (MiKTeX 2.9 64-bit) (preloaded format=pdflatex 2014.12.20) 21 DEC 2014 17:53 entering extended mode **TracingPaper.tex (C:\UConn\TracingPaper\TracingPaper.tex @@ -85,8 +85,17 @@ LaTeX 
Warning: Unused global option(s): (C:\UConn\TracingPaper\TracingPaper.aux +LaTeX Warning: Label `Run Patterns' multiply defined. + + +LaTeX Warning: Label `Locating Performance Bottlenecks' multiply defined. + + LaTeX Warning: Label `Ellard Ledlie 2003' multiply defined. + +LaTeX Warning: Label `Ellard 2003' multiply defined. + ) LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 54. LaTeX Font Info: ... okay on input line 54. @@ -165,31 +174,28 @@ File: omsptm.fd LaTeX Font Info: Font shape `OMS/ptm/m/n' in size <10> not available (Font) Font shape `OMS/cmsy/m/n' tried instead on input line 108. [2] [3] -LaTeX Font Info: Font shape `OT1/ptm/bx/it' in size <10> not available -(Font) Font shape `OT1/ptm/b/it' tried instead on input line 164. - -Underfull \hbox (badness 1117) in paragraph at lines 164--168 -\OT1/ptm/b/it/10 pcap2ds\OT1/ptm/m/n/10 : The pcap2ds pro-gram reads the con-te -nts - [] - - -Underfull \vbox (badness 2828) has occurred while \output is active [] +Underfull \vbox (badness 10000) has occurred while \output is active [] [4] - +LaTeX Font Info: Font shape `OT1/ptm/bx/it' in size <10> not available +(Font) Font shape `OT1/ptm/b/it' tried instead on input line 185. + + File: communications_sketch.png Graphic file (type png) -Package pdftex.def Info: communications_sketch.png used on input line 200. +Package pdftex.def Info: communications_sketch.png used on input line 207. (pdftex.def) Requested size: 180.67499pt x 180.89331pt. + +Underfull \vbox (badness 10000) has occurred while \output is active [] + + [5] Missing character: There is no â in font ptmr7t! Missing character: There is no † in font ptmr7t! Missing character: There is no ’ in font ptmr7t! Missing character: There is no â in font ptmr7t! Missing character: There is no † in font ptmr7t! Missing character: There is no ’ in font ptmr7t! - [5 ] Missing character: There is no â in font ptmr7t! Missing character: There is no € in font ptmr7t! 
Missing character: There is no ś in font ptmr7t! @@ -200,9 +206,7 @@ Missing character: There is no Missing character: There is no € in font ptmr7t! Missing character: There is no ™ in font ptmr7t! -Underfull \vbox (badness 10000) has occurred while \output is active [] - - [6] +[6 ] Underfull \vbox (badness 10000) has occurred while \output is active [] @@ -229,28 +233,28 @@ Missing character: There is no Missing character: There is no ť in font ptmr7t! [8] -Underfull \hbox (badness 1215) in paragraph at lines 364--365 +Underfull \hbox (badness 1215) in paragraph at lines 371--372 []\OT1/ptm/m/n/10 4. Dump-cap has op-tion called \OT1/ptm/m/it/10 snaplength \O T1/ptm/m/n/10 to [] [9] -Underfull \hbox (badness 10000) in paragraph at lines 401--402 +Underfull \hbox (badness 10000) in paragraph at lines 408--409 []\OT1/ptm/m/it/10 Common In-ter-net File Sys-tem (CIFS) Pro- [] -Underfull \hbox (badness 10000) in paragraph at lines 401--402 +Underfull \hbox (badness 10000) in paragraph at lines 408--409 \OT1/ptm/m/it/10 to-col\OT1/ptm/m/n/10 , urlhttp://msdn.microsoft.com/en- [] -Underfull \hbox (badness 10000) in paragraph at lines 403--404 +Underfull \hbox (badness 10000) in paragraph at lines 410--411 []\OT1/ptm/m/it/10 Server Mes-sage Block (SMB) Pro-to- [] -Underfull \hbox (badness 10000) in paragraph at lines 403--404 +Underfull \hbox (badness 10000) in paragraph at lines 410--411 \OT1/ptm/m/it/10 col\OT1/ptm/m/n/10 , urlhttp://msdn.microsoft.com/en- [] @@ -262,20 +266,20 @@ LaTeX Warning: There were multiply-defined labels. 
) Here is how much of TeX's memory you used: - 1492 strings out of 493705 - 20098 string characters out of 3144575 - 81963 words of memory out of 3000000 - 4816 multiletter control sequences out of 15000+200000 + 1494 strings out of 493705 + 19998 string characters out of 3144575 + 81983 words of memory out of 3000000 + 4818 multiletter control sequences out of 15000+200000 22634 words of font info for 45 fonts, out of 3000000 for 9000 1025 hyphenation exceptions out of 8191 - 34i,8n,21p,2810b,437s stack positions out of 5000i,500n,10000p,200000b,50000s + 34i,8n,21p,1884b,437s stack positions out of 5000i,500n,10000p,200000b,50000s {C:/Program Files/MiKTeX 2.9/fonts/enc/dvips/fontname/8r.enc} -Output written on TracingPaper.pdf (10 pages, 4043816 bytes). +Output written on TracingPaper.pdf (10 pages, 4041247 bytes). PDF statistics: 63 PDF objects out of 1000 (max. 8388607) 0 named destinations out of 1000 (max. 500000) diff --git a/TracingPaper.pdf b/TracingPaper.pdf index e7fe898..3b63463 100644 Binary files a/TracingPaper.pdf and b/TracingPaper.pdf differ diff --git a/TracingPaper.synctex.gz b/TracingPaper.synctex.gz index d608727..9eb855b 100644 Binary files a/TracingPaper.synctex.gz and b/TracingPaper.synctex.gz differ diff --git a/TracingPaper.tex b/TracingPaper.tex index 30e43bc..6c5e9d7 100644 --- a/TracingPaper.tex +++ b/TracingPaper.tex @@ -131,63 +131,70 @@ This paper shows that the technology being actively researched gains improvement \subsection{The Need for a New Study} \label{The Need for a New Study} -\textbf{Make use of Leung Paper - 'Table 2: Summary of major file system studies over the past two decades'.} As has been pointed out by past work, the design of systems is usually guided by an understanding of the file system workloads and user behavior~\cite{Leung2008}. 
It is for that reason that new studies are constantly performed by the science community, from large scale studies to individual protocol studies~\cite{Leung2008,Ellard2003,Anderson2004,Roselli2000,Vogels1999}. Even within these studies, the information gleaned is only as meaningful as the considerations of how the data is handled. The following are issues that our work hopes to alleviate: there has been no large scale study done on networks for some time, there has been no study on CIFS(Common Internet File System)/SMB(Server Message Block) protocols for even longer, and most importantly these studies have not tackled the spacial \& temporal scaling idiosyncrasies of network communication. It is for these reasons that we have developed this tracing system and have developed new studies for temporal scaling. This was done through process ID tracking which is further explained in section~\ref{Process ID Tracking}. +As has been pointed out by past work, the design of systems is usually guided by an understanding of the file system workloads and user behavior~\cite{Leung2008}. It is for that reason that new studies are constantly performed by the science community, from large scale studies to individual protocol studies~\cite{Leung2008,Ellard2003,Anderson2004,Roselli2000,Vogels1999}. Even within these studies, the information gleaned is only as meaningful as the considerations of how the data is handled. The following are issues that our work hopes to alleviate: there has been no large scale study done on networks for some time, there has been no study on CIFS(Common Internet File System)/SMB(Server Message Block) protocols for even longer, and most importantly these studies have not tackled lower level aspects of the trace, such as spacial \& temporal scaling idiosyncrasies of network communication. It is for these reasons that we have developed this tracing system and have developed new studies for lower level aspects of communication network. 
A detailed overview of the tracing and analysis system can be seen in section~\ref{Tracing System}. The hope is to further the progress made with benchmarks \& tracing in the hope that it too will lend to improving and deepening the knowledge and understanding of these systems so that as a result the technology and methodology is bettered as a whole. \section{Methodology} \label{Methodology} -\subsection{Effects of System Setup on Tracing} -\label{Effects of System Setup on Tracing} -When initially designing the tracing system used in this paper, different aspects were taken into account, such as space limitations of the tracing system, packet capture limitations (e.g. file size), and speed limitations of the hardware. The major space limitation that is dealt with in this work is the amount of space that the system has for storing the captured packets, including the resulting ds-file compressions. The limitation encountered in the packet capture system deals with the functional pcap (packet capture file) size, found to be about 750MB. \textit{When attempting to run tshark with larger pcap files (such as 1GB) it was found that once the program ran for some time (typically about 772 files) it would crash (often due to a stack smashing error). Unfortunately the cause of this error has yet to be discovered.} The speed limitations of the hardware are dictated by the hardware being used (e.g. GB capture interface) and the software that makes use of this hardware (e.g. PF\_RING). After all, our data can only be as accurate as the information being captured. +\subsection{System Limitations} +\label{System Limitations} +When initially designing the tracing system used in this paper, different aspects were taken into account, such as space limitations of the tracing system, packet capture limitations (e.g. file size), and speed limitations of the hardware.
The major space limitation that is dealt with in this work is the amount of space that the system has for storing the captured packets, including the resulting DataSeries-file compressions. One limitation encountered in the packet capture system deals with the functional pcap (packet capture file) size. The concern being that the pcap files only need to be held until they have been filtered for specific protocol information and then compressed using the DataSeries format, but still allow for room for the DataSeries files being created to be stored. Other limitation concerns came from the software and packages used to collect the network traffic data~\cite{Orosz2013,Dabir2008,Skopko2012}. These ranged from timestamp resolution provided by the tracing system's kernel~\cite{Orosz2013} to how the packet capturing drivers and programs (such as dumpcap and tshark) operate along with how many copies are performed and how often. These aspects were tackled by installing PF\_RING, which is a kernel module which allows for kernel-based capture and sampling with the idea that this will limit packet loss and timestamp overhead leading to faster packet capture while efficiently preserving CPU cycles~\cite{PFRING}. The speed limitations of the hardware are dictated by the hardware being used (e.g. GB capture interface) and the software that makes use of this hardware (e.g. PF\_RING). After all, our data can only be as accurate as the information being captured~\cite{Ellard2003,Anderson2004}. +Other concerns deal with whether or not the system would be able to function optimally during periods of high network traffic. All aspects of the system, from the hardware to the software, have been altered to help combat these concerns and allow for the most accurate packet capturing possible.
\subsection{Main Challenges} \label{Main Challenges} -Challenges include: Interpretation of data, selective importance of information, arbitrary distribution of collected information +Challenges include: Interpretation of data, selective importance of information, arbitrary distribution of collected information. +One glaring challenge with building this tracing system was using code written by others; tshark \& DataSeries. While these programs are used within the tracing structure (which will be further examined in section~\ref{Tracing System}) there are some issues when working with them. These issues ranged from data type limitations of the code to hash value \& checksum miscalculations due to encryption of specific fields/data. Attempts were made to dig into and correct these issues, but they were so inherent to the code being worked with that hacks and workarounds were developed to minimize their effect. Other challenges centralize around selection, interpretations and distribution scope of the data collected. Which fields should be filtered out from the original packet capture? What data is most prophetic to the form and function of the network being traced? What should be the scope, with respect to time, of the data being examined? Where will the most interesting information appear? As each obstacle was tackled, new information and ways of examining the data reveal themselves and with each development different alterations \& corrections are made. \subsection{Interpretation of Data} \label{Interpretation of Data} -Unfortunately benchmarks require that the person(s) creating the benchmark determines the interpretation of the data collected. To some degree these interpretations are easy to make (e.g. file system behavior \& user behavior~\cite{Leung2008}) while others are more complicated (e.g. temporal scaling of occurances of read/write), but in all scenarios there is still the requirment for human interpretation of the data.
While having humans do the interpretations can be adventageous, a lack of all the "background" information can also lead to incorrectly interpreting the information. The hope of this project is that, despite the possible pitfall of incorrect data interpretation, we will be able to not only find out more about the workings and uses of a network but also produce a meaningful benchmark that will more accurately represent the spacial and temporal aspects of large communication networks. +Unfortunately benchmarks require that the person(s) creating the benchmark determines the interpretation of the data collected. To some degree these interpretations are easy to make (e.g. file system behavior \& user behavior~\cite{Leung2008}) while others are more complicated (e.g. temporal scaling of occurrences of read/write), but in all scenarios there is still the requirement for human interpretation of the data. While having humans do the interpretations can be advantageous, a lack of all the ``background'' information can also lead to incorrectly interpreting the information. The hope of this project is that, despite the possible pitfall of incorrect data interpretation, we will be able to not only find out more about the workings and uses of a network but also produce a meaningful benchmark that will more accurately represent the low level aspects of large communication networks. -\subsection{Selective Importance of Information} -\label{Selective Importance of Information} -Expanding on the previous point about interpretation of data, another human factor of benchmark creation is selecting which information is important or which information will give the greatest insight to the workings on the network. As stated earlier too little information can lead to incorrect conclusions being drawn about the workings on the system, while too much information (and not knowing which information is pertinent) can lead to erroneous conclusions as well.
Thus there is a need to strike a balance between what information is important enough to capture (so as not to slow down the capturing process through needless processing) while still obtaining enough information to acquire the bigger picture of what is going on. Unfortunately every step of the tracing process requires a degree of human input to decide what network information will end up providing the most complete picture of the network communication and how to interpret that data into meaningful graphs and tables. +\subsection{Scope of Interpretation} +\label{Scope of Interpretation} +Expanding on the previous point about interpretation of data, another human factor of benchmark creation is selecting which information is important or which information will give the greatest insight to the workings on the network. As stated earlier too little information can lead to incorrect conclusions being drawn about the workings on the system, while too much information (and not knowing which information is pertinent) can lead to erroneous conclusions as well. Thus there is a need to strike a balance between what information is important enough to capture (so as not to slow down the capturing process through needless processing) while still obtaining enough information to acquire the bigger picture of what is going on. Unfortunately every step of the tracing process requires a degree of human input to decide what network information will end up providing the most complete picture of the network communication and how to interpret that data into meaningful graphs and tables. This can lead to either findings around the focus of the work being done, or even lead to discoveries of other phenomena that end up having far more impact on the overall performance of the system~\cite{Ellard2003}.
-\subsection{Arbitrary Distribution of Collected Information} -\label{Arbitrary Distribution of Collected Information} -Even when all the information is collected and the most important data has been selected, there is still the issue of what lens should be used to view this information. In terms of the tracing data, the different lenses mainly deal with time: hour, day, week, month, semester, year, etc. Because the data being collected is from an active network, there will be differing activity depending on the time of day, week, and scholastic year. For example, although the first week or so of the year may contain a lot of traffic, this does not mean that trends of that period of time will occur for every week of the year (except perhaps the final week of the semester). The trends and habits of the network will change based on the time of year, time of day, and even depend on the exam schedule. For these reasons one will see different trends depending on the distribution of the data used for analysis, and the truly interesting examination of data requires looking at all different periods of time to see how all these factors play into the communications of the network. +Even when all the information is collected and the most important data has been selected, there is still the issue of what lens should be used to view this information. Because the data being collected is from an active network, there will be differing activity depending on the time of day, week, and scholastic year. For example, although the first week or so of the year may contain a lot of traffic, this does not mean that trends of that period of time will occur for every week of the year (except perhaps the final week of the semester). The trends and habits of the network will change based on the time of year, time of day, and even depend on the exam schedule. 
For these reasons one will see different trends depending on the distribution of the data used for analysis, and the truly interesting examination of data requires looking at all different periods of time to see how all these factors play into the communications of the network. \section{Tracing System} \label{Tracing System} -\subsection{Different Stages of Trace} -\label{Different Stages of Trace} -\textit{\textbf{tshark}}: The tshark program acts as a collection program for grabbing all of the redirected network traffic and saving this packet information into files (i.e. pcap files). In order to help minimize packet loss, as this represents lost data, the '-n' option is used so that network object name resolution is disabled, thus helping simplify the packet capturing process. \textbf{Note:} This section may need to be changed to deal with Dumpcap depending on how testing goes -\\\textit{\textbf{pcap2ds}}: The pcap2ds program reads the contents of each pcap file and rewrites the information in the DataSeries format (i.e. ds files). The most important aspect of this step is that while this re-formatting of information is occurring, there is also a compression of information taking place (i.e., the file is in a "zipped"/"tar-ed" form). Preliminary examination of the numbers show ~99\% compression. The key reason for this compression is that the pcap2ds program goes through the contents of the pcap file and only writes field information that the user believes to be important or useful; not all of the information that is sent through network communications is pertinent to our tracking of client-server interactions. Due to the fundamental nature of this work, there is no need to track every piece of information that is exchanged, only that information which illuminates the behavior of the clients \& servers that function over the network (e.g. read \& write transactions). 
-\\\textit{\textbf{inotify}}: The inotify program acts as a watchdog for the directory in which tshark is writing its pcap files. As each pcap file is "completed" (i.e., has been written to the full desired size: 750MB) inotify "sees" the 'closed after write' (i.e., a file is closed after writing to it) that occurs and calls pcap2ds on the newly finished pcap file. In order to do this inotify calls the fork\_test() function, where a fork is called and each child process prepares the arguments required for running pcap2ds with a certain protocol (e.g. SMB, NFS, iSCSI), then runs that instance of pcap2ds. It should be noted that while the system is capable of performing pcap2ds using SMB, NFS \& iSCSI protocols, the system currently only deals with the SMB/CIFS protocol. While these forked pcap2ds instances run, inotify continues to monitor the pcap file directory so as not to miss any of the incoming information. -\\\textit{\textbf{analysis}}: The analysis program is used to analyze the data collected in order to gleam meaningful information from the traces captured. Currently this code is used to find the IAT (inter-arrivale time) between requests and responses for given command packets sent over the network. This analysis will eventually incorporate oplocks and other aspects of resource sharing on the network to gain a more complete picture of the network's usage and bottlenecks. +\subsection{Stages of Trace} +\label{Stages of Trace} -\subsection{About the Systems Being Traced} -\label{About the Systems Being Traced} -\textit{\textbf{SMB Server, iSCSI Trace, ECS, etc.}}: The SMB/CIFS information being captured comes from the university network. All packet and transaction information is passed through a duplicating switch(\textit{pipe?}) that then allows for the tracing system to capture these packet transactions over a 10 GB(\textit{bytes?bit?}) port. 
The reason for using 10GB/b hardware is to help ensure that the system is able to capture any \& all information on the network -\\\textit{\textbf{Expectations}}: SMB will be heavily used by students to access their network accounts from any networked computer, along with network access to shared file systems and connected printers. Oplocks will be in heavy use and cause a slowdown of the system for multiuser shared storage space. Authentication of network computers could bottleneck during moments of high traffic (e.g. students all logging in for a class). -\\\textit{\textbf{Potential Difficulties}}: Unable to capture all of the traffic occurring, being able to preform packet analysis (pcap2ds) at a speed greater than or equal to the speed of the incoming information. To some degree the system attempts to alleviate this issue the intial capturing process (i.e. Dumpcap) is writing to a RAMDisk to as to not have the writing of the packets play the limiting factor of the trace. It should be noted that while the pcap2ds code reads from the RAMDisk it is writing to the SSD which has a signifcantly slower write speed. +\subsubsection{Capture} +\label{Capture} +The packet capturing aspect of the tracing system is fairly straightforward. On top of the previously mentioned alterations to the system (e.g. PF\_RING), the capture of packets is done through the use of \textit{tshark}, \textit{pcap2ds}, and \textit{inotify} programs. The broad strokes are that incoming SMB/CIFS information comes from the university's network. All packet and transaction information is passed through a duplicating switch that then allows for the tracing system to capture these packet transactions over a 10 Gb port. The reason for using 10Gb hardware is to help ensure that the system is able to capture any \& all information on the network.
These packets are then passed along to the \textit{tshark} packet collection program (which is the terminal version of wireshark) which records these packets into a cyclical capturing ring. A watchdog program (called \textit{inotify}) watches the directory where all of these packet-capture (pcap) files are being stored and as a new pcap file is completed \textit{inotify} passes the file to \textit{pcap2ds} along with what protocol is being examined (i.e. SMB). The \textit{pcap2ds} program reads through the given pcap files, filters out any data fields deemed important or interesting for the passed protocol type, then the results are written in DataSeries format and these compressed files are then collected and stored. Due to the fundamental nature of this work, there is no need to track every piece of information that is exchanged, only that information which illuminates the behavior of the clients \& servers that function over the network (e.g. read \& write transactions). It should also be noted that all sensitive information being captured by the tracing system is encrypted to protect the users whose information is being examined by this tracing system. + +\subsubsection{Collection} +\label{Collection} +The collection of these files is rather straightforward. Once the DataSeries files have been collected to an arbitrary amount (in this case 100,000 files), these files are then moved off of the tracing system and are stored on a more secure \textit{/trace-store/} machine. This storage location is only accessible from the trace system, and its disk is RAIDed to protect against data loss of the collected DataSeries files. These files are then used in analysis to determine the behavior on the university network.
+
+\subsubsection{Dissection/Analysis}
+\label{Dissection/Analysis}
+The trace analysis is performed by an analysis module code that both processes the DataSeries files for extraction of information and also outputs meaningful information (such as IO patterns) to a file that can be used for further analysis. This section of the tracing system is always growing and changing as discoveries and limitations are found during the continuous execution of this code. Alterations range from edits to speed up the analysis process to adjustments to how communications are tracked and interpreted. This analysis will eventually incorporate oplocks and other aspects of resource sharing on the network to gain a more complete picture of the network's usage and bottlenecks. \section{Trace Analysis} \label{Trace Analysis} The trace analysis is performed by an AnalysisModule code that both processes the ds-files for extraction of information to an event\_data structure and also outputs meaningful information (such as the IAT times between request and response packets) to a file that can be used for further analysis.
-\subsection{Request/Response Command Pairing}
-\label{Request/Response Command Pairing}
-All comands sent over the network are coupled to an identifying MID/PID/TID/UID tuple. Since the only commands being examined are read or write commands, the identifying characteristic distinguishing a request command packet from a reponse command packet is the addition of an FID field with the sent packet. It is examination of the packets for this FID field that allows the analysis code to distinguish between request \& response command pakets. The pairing is done by examining the identifying tuple and assuming that each tuple-identified system will only send one command at a time (awaiting a response before sending the next command of that same type). 
- -\subsection{IAT Calculation for Request/Response Pairs} -\label{IAT Calculation for Request/Response Pairs} -The act of Response/Request (ResReq) IAT calculation is performed during the updating of the response-packet tracking within the event\_data structure. The IAT is calculated by finding the difference between the last request packet time (for the identified tuple) \& the time stamp of the currently recorded response packet. Since the time stamps for both the request and response packets are those belonging to the same identifying tuple, then we can assume that the IAT calculated is that of a single request-response pairing. It should be noted that in the scenario of a response packet occuring \textit{before} a request packet has been logged (e.g. not having the data from the originating request packet), the "IAT calculated" is placed in the "OTHER" bucket because the calculated IAT would produce eroneous data (e.g. N/A [0] - Response Time). - \subsection{SMB} \label{SMB} Server Message Block (SMB) is the modern dialect of Common Internet File System (CIFS). The most important aspect of SAMBA (e.g. SMB) is that it is a stateful protocol , i.e. one where the information being sent via SMB has identifying fields that allow for process ID tracking. -\\The structure for sending message payloads in SMB is as follows: each SMB message is split into three blocks. The first block is a fixed-length SMB header. The second block is made up of two variable-length blocks called the SMB parameters. The third block is made up of the SMB data. Depending on the transaction occurring these different blocks are used in different manners. For example, the SMB protocol dictates that error responses \textbf{should} be sent with empty SMB parameters \& SMB data blocks (along with the WordCount \& ByteCount fields set to zero). The purpose of the SMB header is particularly important because the header identifies the message as an SMB message payload~\cite{MS-CIFS}. 
When used in a response message the header also includes status information that indicates whether and how the command succeeded or failed. The most important aspects of the SMB header, which the tracing system constantly examines, are the PID/MID tuple (for the purpose of identifying a client/server) and the commands value which is passed (notifying our tracing system of the actions taking place on the network). It is through this command field that the process ID tracking system is able to follow the different commands (read/write/general event) that occur and try to find patterns in these network communications. +\\The structure for sending message payloads in SMB is as follows: each SMB message is split into three blocks. The first block is a fixed-length SMB header. The second block is made up of two variable-length blocks called the SMB parameters. The third block is made up of the SMB data. Depending on the transaction occurring these different blocks are used in different manners. The purpose of the SMB header is particularly important because the header identifies the message as an SMB message payload~\cite{MS-CIFS}. When used in a response message the header also includes status information that indicates whether and how the command succeeded or failed. The most important aspects of the SMB header, which the tracing system constantly examines, are the PID/MID tuple (for the purpose of identifying a client/server) and the commands value which is passed (notifying our tracing system of the actions taking place on the network). It is through this command field that the process ID tracking system is able to follow the different commands (read/write/general event) that occur and try to find patterns in these network communications. +\\\textit{\textbf{Expectations}}: SMB will be heavily used by students to access their network accounts from any networked computer, along with network access to shared file systems and connected printers. 
Oplocks will be in heavy use and cause a slowdown of the system for multiuser shared storage space. Authentication of network computers could bottleneck during moments of high traffic (e.g. students all logging in for a class).
+
+\subsection{ID Tracking}
+\label{ID Tracking}
+All commands sent over the network are coupled to an identifying MID/PID/TID/UID tuple. Since the only commands being examined are read or write commands, the identifying characteristic distinguishing a request command packet from a response command packet is the addition of an FID field with the sent packet. It is examination of the packets for this FID field that allows the analysis code to distinguish between request \& response command packets. The pairing is done by examining the identifying tuple and assuming that each tuple-identified system will only send one command at a time (awaiting a response before sending the next command of that same type).
+\\Following these process IDs is a way to check for intercommunication between two or more processes. In particular, we examine the compute time \& I/O (input/output) time (i.e. time spent in communication; between information arrivals). This is done by examining the inter-arrival times (IAT) between the server \& the client. This is interesting because this information will give us a realistic sense of the data transit time of the network connections being used (e.g. ethernet, firewire, fibre, etc.). Other pertinent information would be how often the client makes requests \& how often this event occurs per client process ID, identifiable by their PID/MID tuple. One could also track the amount of sharing that is occurring between users. The PID is the process identifier and the MID is the multiplex identifier, which is set by the client and is to be used for identifying groups of commands belonging to the same logical thread of operation on the client node. 
+\\The per client process ID can be used to map the activity of given programs, thus allowing for finer granularity in the produced benchmark (e.g. control down to process types ran by individual client levels). Other features of interest are the time between an open \& close, or how many opens/closes occurred in a window (e.g. a period of time). This information could be used as a gauge of current day trends in filesystem usage \& its consequent taxation on the surrounding network. It would also allow for greater insight on the r/w habits of users on a network along with a rough comparison between other registered events that occur on the network. Lastly, though no less important, it would allow us to look at how many occurrences there are of shared files between different users, though one must note that there is some issue (though hopefully rare) of resource locking (e.g. shared files) that needs to be taken into account. This is initially addressed by monitoring any oplock flags that are sent for read \& writes. This information also helps provide a preliminary mapping of how the network is used and what sort of traffic populates the communication. + +\subsection{Run Patterns} +\label{Run Patterns} + +\subsection{Locating Performance Bottlenecks} +\label{Locating Performance Bottlenecks} \subsection{Other (e.g. HTML)} \label{Other (e.g. HTML)} @@ -204,9 +211,9 @@ Server Message Block (SMB) is the modern dialect of Common Internet File System \end{centering} \end{figure} -Following these process IDs is as a way to check for intercommunication between two or more processes. In particular, we examine the compute time \& I/O (input/output) time (i.e. time spent in communication; between information arrivals). This is done by examining the inter-arrival times (IAT) between the server \& the client. This is interesting because this information will give us a realistic sense of the data transit time of the network connections being used (e.g. 
ethernet, firewire, fibre, etc.). Other pertinent information would be how often the client makes requests \& how often this event occurs per client process ID, identifiable by their PID/MID tuple. One could also track the amount of sharing that is occurring between users. The PID is the process identifier and the MID is the multiplex identifier, which is set by the client and is to be used for identifying groups of commands belonging to the same logical thread of operation on the client node. Tracking the IAT is interesting because we want to know the activity of the client (i.e. how many connections/connection requests each client is producing) which can be used to map behavior for low, medium \& high level clients (i.e. amount of traffic being produced) for use in an adaptive benchmarking system. The per client process ID can be used to map the activity of given programs, thus allowing for finer granularity in the produced benchmark (e.g. control down to process types ran by individual client levels). Figure~\ref{fig-communication} shows a rough sketch of communication between a client \& server. The general order that constitutes a full tracking is as follows: (client) computation [process to filesystem], (client) communication [SMB protocol used to send data client→server], (server) timestamping + service [server gets data, logs it, performs service], (server) communication [SMB data send server→client], (client) next computation. Other features of interest are the time between an open \& close, or how many opens/closes occurred in a window (e.g. a period of time). This information could be used as a gauge of current day trends in filesystem usage \& its consequent taxation on the surrounding network. It would also allow for greater insight on the r/w habits of users on a network along with a rough comparison between other registered events that occur on the network. 
Lastly, though no less important, it would allow us to look at how many occurrences there are of shared files between different users, though one must note that there is some issue (though hopefully rare) of resource locking (e.g. shared files) that needs to be taken into account. This is initially addressed by monitoring any oplock flags that are sent for read \& writes. +Figure~\ref{fig-communication} shows a rough sketch of communication between a client \& server. The general order that constitutes a full tracking is as follows: (client) computation [process to filesystem], (client) communication [SMB protocol used to send data client→server], (server) timestamping + service [server gets data, logs it, performs service], (server) communication [SMB data send server→client], (client) next computation. -Currently the focus of process ID tracking is to see the number of reads, writes and events that occur due to the actions of clients on the network. This is done by using a tuple of the PID \& MID fields which allows for the identification of client. Since these values are unique and \textbf{MUST} be sent with each packet, this tuple is used as the key for the unordered map that is used to track this information. The structure is as follows: the tuple functions as the key for the pairing of the identifying tuple \& corresponding event\_data structure; which is used to house pertinent information about reads/writes/events. The information stored in the structure is the last time a read/write/event occurred, the total IAT of the observed read/write/events, and the total number of reads/writes/events that have occurred for the identified tuple. The purpose for tracking this information is to profile the read/write “habits” of the users on the network as well as comparing this information against the general events’ inter-arrival times, thus allowing one to see if the read \& write events are being processed differently (e.g. 
longer or shorter IATs) than the rest of the events occurring on the network. This information also helps provide a preliminary mapping of how the network is used and what sort of traffic populates the communication. +Currently the focus of process ID tracking is to see the number of reads, writes and events that occur due to the actions of clients on the network. This is done by using a tuple of the PID \& MID fields which allows for the identification of client. Since these values are unique and \textbf{MUST} be sent with each packet, this tuple is used as the key for the unordered map that is used to track this information. The structure is as follows: the tuple functions as the key for the pairing of the identifying tuple \& corresponding event\_data structure; which is used to house pertinent information about reads/writes/events. The information stored in the structure is the last time a read/write/event occurred, the total IAT of the observed read/write/events, and the total number of reads/writes/events that have occurred for the identified tuple. The purpose for tracking this information is to profile the read/write “habits” of the users on the network as well as comparing this information against the general events’ inter-arrival times, thus allowing one to see if the read \& write events are being processed differently (e.g. longer or shorter IATs) than the rest of the events occurring on the network. One should note that there are separate purposes to the PID/MID tuple from the PID/MID/TID/UID tuple. The first tuple (2-tuple) is used to uniquely identify groups of commands belonging to the same logical thread of operation on the client node, while the latter tuple (4-tuple) allows for unique identification for request \& responses that are part of the same transaction. 
While the PID/MID tuple is mainly what we are interested in, since this allows the following of a single logical thread, there is some interest in making use of the TID/UID tuple because this would allow us to count the number of transactions that occur in a single logical thread. This information could provide interesting information on how the computer systems on the network may be deciding to handle/send commands over the network; e.g. sending multiple commands per transaction, multiple packet commands per transaction, etc. @@ -413,6 +420,8 @@ File system usage in Windows NT 4.0}, Proceedings of the seventeenth ACM symposi \bibitem{Meyer2012} Dutch T. Meyer and William J. Bolosky, \emph{ A Study of Practical Deduplication}, ACM Transactions on Storage (January 2012)
+\bibitem{PFRING} \emph{PF\_RING High-speed packet capture, filtering and analysis}, \url{http://www.ntop.org/products/pf\_ring/}
+
 \end{thebibliography} \end{document}