diff --git a/TracingPaper.aux b/TracingPaper.aux index a173217..fea08f0 100644 --- a/TracingPaper.aux +++ b/TracingPaper.aux @@ -1,7 +1,6 @@ \relax \citation{Anderson2004} \citation{Leung2008} -\citation{Anderson2004} \citation{Orosz2013} \citation{Dabir2008} \citation{Skopko2012} @@ -36,14 +35,14 @@ \newlabel{Introduction}{{1}{1}} \@writefile{toc}{\contentsline {subsection}{\numberline {1.1}Issues with Tracing}{1}} \newlabel{Issues with Tracing}{{1.1}{1}} +\@writefile{toc}{\contentsline {subsection}{\numberline {1.2}Previous Advances Due to Testing}{1}} +\newlabel{Previous Advances Due to Testing}{{1.2}{1}} \citation{Orosz2013} \citation{Skopko2012} \citation{PFRING} \citation{PFRINGMan} \citation{PFRING} \citation{PFRINGMan} -\@writefile{toc}{\contentsline {subsection}{\numberline {1.2}Previous Advances Due to Testing}{2}} -\newlabel{Previous Advances Due to Testing}{{1.2}{2}} \@writefile{toc}{\contentsline {subsection}{\numberline {1.3}Contributions}{2}} \newlabel{Contributions}{{1.3}{2}} \@writefile{toc}{\contentsline {section}{\numberline {2}Trace Collection}{2}} @@ -83,6 +82,8 @@ \citation{Bolosky2007} \citation{EllardLedlie2003} \bibcite{Leung2008}{1} +\bibcite{Ellard2003}{2} +\bibcite{EllardLedlie2003}{3} \@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Locating Performance Bottlenecks}{6}} \newlabel{Locating Performance Bottlenecks}{{3.3}{6}} \@writefile{toc}{\contentsline {section}{\numberline {4}Intuition Confirm/Change}{6}} @@ -91,8 +92,6 @@ \newlabel{Characterizations of Different Packet Types}{{4.1}{6}} \@writefile{toc}{\contentsline {section}{\numberline {5}Conclusion}{6}} \newlabel{Conclusion}{{5}{6}} -\bibcite{Ellard2003}{2} -\bibcite{EllardLedlie2003}{3} \bibcite{Anderson2004}{4} \bibcite{Orosz2013}{5} \bibcite{Dabir2008}{6} diff --git a/TracingPaper.log b/TracingPaper.log index 46efb4e..56dba48 100644 --- a/TracingPaper.log +++ b/TracingPaper.log @@ -1,4 +1,4 @@ -This is pdfTeX, Version 3.1415926-2.3-1.40.12 (MiKTeX 2.9 64-bit) (preloaded format=pdflatex 2012.11.13) 20 MAR 2015 10:25 +This is pdfTeX, Version 3.1415926-2.3-1.40.12 (MiKTeX 2.9 64-bit) (preloaded format=pdflatex 2012.11.13) 20 MAR 2015 12:38 entering extended mode **C:/Users/rundeMT/Documents/UConn/TracingPaper/TracingPaper.tex (C:/Users/rundeMT/Documents/UConn/TracingPaper/TracingPaper.tex @@ -149,6 +149,11 @@ Underfull \hbox (badness 10000) in paragraph at lines 79--84 [] + +Underfull \hbox (badness 10000) in paragraph at lines 89--92 + + [] + Missing character: There is no â in font ptmr7t! Missing character: There is no € in font ptmr7t! Missing character: There is no ś in font ptmr7t! @@ -156,107 +161,96 @@ Missing character: There is no Missing character: There is no € in font ptmr7t! Missing character: There is no ť in font ptmr7t! -Underfull \hbox (badness 10000) in paragraph at lines 105--109 +Underfull \hbox (badness 10000) in paragraph at lines 105--108 [] +[1{C:/ProgramData/MiKTeX/2.9/pdftex/config/pdftex.map} -Underfull \vbox (badness 1436) has occurred while \output is active [] - - [1{C:/ProgramData/MiKTeX/2.9/pdftex/config/pdftex.map} - -] - -LaTeX Warning: Reference `Tracing System' on page 2 undefined on input line 110 -. - -[2] +] [2] LaTeX Font Info: Font shape `OT1/ptm/bx/it' in size <10> not available -(Font) Font shape `OT1/ptm/b/it' tried instead on input line 146. +(Font) Font shape `OT1/ptm/b/it' tried instead on input line 145. 
[3] -Underfull \hbox (badness 2042) in paragraph at lines 159--161 +Underfull \hbox (badness 2042) in paragraph at lines 158--160 \OT1/ptm/m/n/10 been fil-tered for spe-cific pro-to-col in-for-ma-tion and [] -Underfull \hbox (badness 1552) in paragraph at lines 159--161 +Underfull \hbox (badness 1552) in paragraph at lines 158--160 \OT1/ptm/m/n/10 to how the packet cap-tur-ing drivers and pro-grams [] -Underfull \hbox (badness 10000) in paragraph at lines 159--161 +Underfull \hbox (badness 10000) in paragraph at lines 158--160 [] [4] -Underfull \hbox (badness 10000) in paragraph at lines 207--208 +Underfull \hbox (badness 10000) in paragraph at lines 206--207 [] -LaTeX Font Info: Try loading font information for OMS+ptm on input line 214. +LaTeX Font Info: Try loading font information for OMS+ptm on input line 213. ("C:\Program Files\MiKTeX 2.9\tex\latex\psnfss\omsptm.fd" File: omsptm.fd ) LaTeX Font Info: Font shape `OMS/ptm/m/n' in size <10> not available -(Font) Font shape `OMS/cmsy/m/n' tried instead on input line 214. +(Font) Font shape `OMS/cmsy/m/n' tried instead on input line 213. [5] -Underfull \hbox (badness 1077) in paragraph at lines 242--243 +Underfull \hbox (badness 1077) in paragraph at lines 241--242 \OT1/ptm/m/n/10 not only pull out in-for-ma-tion per-ta-nent to the [] [6] -Underfull \hbox (badness 10000) in paragraph at lines 284--285 +Underfull \hbox (badness 10000) in paragraph at lines 283--284 []\OT1/ptm/m/it/10 Common In-ter-net File Sys-tem (CIFS) Pro- [] -Underfull \hbox (badness 10000) in paragraph at lines 284--285 +Underfull \hbox (badness 10000) in paragraph at lines 283--284 \OT1/ptm/m/it/10 to-col\OT1/ptm/m/n/10 , urlhttp://msdn.microsoft.com/en- [] -Underfull \hbox (badness 10000) in paragraph at lines 286--287 +Underfull \hbox (badness 10000) in paragraph at lines 285--286 []\OT1/ptm/m/it/10 Server Mes-sage Block (SMB) Pro-to- [] -Underfull \hbox (badness 10000) in paragraph at lines 286--287 +Underfull \hbox (badness 10000) in paragraph at lines 285--286 \OT1/ptm/m/it/10 col\OT1/ptm/m/n/10 , urlhttp://msdn.microsoft.com/en- [] -Underfull \hbox (badness 10000) in paragraph at lines 301--303 +Underfull \hbox (badness 10000) in paragraph at lines 300--302 []\OT1/ptm/m/it/10 PF[]RING User Guide\OT1/ptm/m/n/10 , url- [] -Overfull \hbox (61.33023pt too wide) in paragraph at lines 301--303 +Overfull \hbox (61.33023pt too wide) in paragraph at lines 300--302 \OT1/ptm/m/n/10 https://svn.ntop.org/svn/ntop/trunk/PF[]RING/doc/UsersGuide.pdf [] -[7] (C:\Users\rundeMT\Documents\UConn\TracingPaper\TracingPaper.aux) - -LaTeX Warning: There were undefined references. - - ) +[7] (C:\Users\rundeMT\Documents\UConn\TracingPaper\TracingPaper.aux) ) Here is how much of TeX's memory you used: - 1477 strings out of 494049 - 19899 string characters out of 3146058 + 1476 strings out of 494049 + 19883 string characters out of 3146058 78651 words of memory out of 3000000 - 4769 multiletter control sequences out of 15000+200000 + 4768 multiletter control sequences out of 15000+200000 20443 words of font info for 42 fonts, out of 3000000 for 9000 715 hyphenation exceptions out of 8191 34i,8n,21p,2172b,435s stack positions out of 5000i,500n,10000p,200000b,50000s -{C:/Program Files/MiKTeX 2.9/fonts/enc/dvips/fontname/8r.enc} -Output written on TracingPaper.pdf (7 pages, 114508 bytes). +{C:/Progr +am Files/MiKTeX 2.9/fonts/enc/dvips/fontname/8r.enc} +Output written on TracingPaper.pdf (7 pages, 113725 bytes). PDF statistics: 51 PDF objects out of 1000 (max. 
8388607) 0 named destinations out of 1000 (max. 500000) diff --git a/TracingPaper.pdf b/TracingPaper.pdf index 935a1fe..30b4b8c 100644 Binary files a/TracingPaper.pdf and b/TracingPaper.pdf differ diff --git a/TracingPaper.synctex.gz b/TracingPaper.synctex.gz index 1ce2799..2a9838e 100644 Binary files a/TracingPaper.synctex.gz and b/TracingPaper.synctex.gz differ diff --git a/TracingPaper.tex b/TracingPaper.tex index a072937..03ac870 100644 --- a/TracingPaper.tex +++ b/TracingPaper.tex @@ -88,7 +88,7 @@ Traces are important for the purpose of developing and taking accurate metrics o As seen in previous trace work done [Leung et al., Ellard et al., Roselli et al.], the general perceptions of how computer systems are being used versus their initial purpose have allowed for great strides in eliminating actual bottlenecks rather than spending unnecessary time working on imagined bottlenecks. The work of Leung \textit{et al.} led to a series of observations, from the fact that files are rarely re-opened to the finding that read-write access patterns are more frequent~\cite{Leung2008}. Without illumination of these underlying actions (e.g. read-write ratios, file death rates, file access rates), these issues cannot be readily tackled. \\ -\textbf{NOT SURE IF KEEP OR NEEDED} I/O benchmarking, the process of comparing I/O systems by subjecting them to known workloads, is a widespread pratice in the storage industry and serves as the basis for purchasing decisions, performance tuning studies, and marketing campaigns ~\cite{Anderson2004}. +%\textbf{NOT SURE IF KEEP OR NEEDED} I/O benchmarking, the process of comparing I/O systems by subjecting them to known workloads, is a widespread practice in the storage industry and serves as the basis for purchasing decisions, performance tuning studies, and marketing campaigns~\cite{Anderson2004}. The purpose of my work is to tackle this gap and hopefully bring insight to the complexity of network communication through the examination of CIFS network traffic. @@ -97,21 +97,20 @@ The purpose of my work is to tackle this gap and hopefully bring insight to the \textbf{REWORD TO REMOVE MENTION OF BENCHMARKS}\\ The majority of benchmarks are attempts to represent a known system and structure on which some ``original'' design/system was tested. While this is all well and good, there are many issues with this sort of approach: temporal \& spatial scaling concerns, timestamping and buffer copying, as well as driver operation for capturing packets~\cite{Orosz2013,Dabir2008,Skopko2012}. Each of these aspects contributes to the initial problems with dissection and analysis of the captured information. For example, inaccuracies in scheduling I/Os may result in as much as a factor of 3.5 difference in measured response time and a factor of 26 in measured queue sizes; differences that are too large to ignore~\cite{Anderson2004}. -Temporal scaling refers to the need to account for the nuances of timing with respect to the run time of commands; consiting of computation, communication \& service. A temporally scalable benchmarking system would take these subtleties into account when expanding its operation across multiple machines in a network. While these temporal issues have been tackled for a single processor (and even somewhat for cases of multi-processor), these same timing issues are not properly handles when dealing with inter-network communication.
Inaccuracies in packet timestamping can be caused due to overhead in generic kernel-time based solutions, as well as use of the kernel data structures ~\cite{Orosz2013,PFRINGMan}.// +Temporal scaling refers to the need to account for the nuances of timing with respect to the run time of commands, consisting of computation, communication \& service. A temporally scalable benchmarking system would take these subtleties into account when expanding its operation across multiple machines in a network. While these temporal issues have been tackled for a single processor (and even somewhat for cases of multi-processor), these same timing issues are not properly handled when dealing with inter-network communication. Inaccuracies in packet timestamping can be caused by overhead in generic kernel-time based solutions, as well as by use of the kernel data structures~\cite{Orosz2013,PFRINGMan}. \\ Spatial scaling refers to the need to account for the nuances of expanding a benchmark to incorporate a number of (\textbf{n}) machines over a network. A system that properly incorporates spatial scaling is one that would be able to incorporate communication (even in varying intensities) between all the machines on a system, thus stress testing all communicative actions and aspects (e.g. resource locks, queueing) on the network. \subsection{Previous Advances Due to Testing} \label{Previous Advances Due to Testing} -Tracing collection and analysis has proved its worth in time from previous studies where can be seen important lessons pulled from the research; change in behavior of read/write events, overhead concerns originating in system implementation, bottlenecks in communication, and other revelations found in the traces. Previous tracing work has shown that one of the largest \& broadest hurdles to tackle is that traces (and benchmarks) must be tailored (to every extent) to the system being tested. There are always some generalizations taken into account but these generalizations can also be a major source of error~\cite{Ellard2003,EllardLedlie2003,Anderson2004,Orosz2013,Dabir2008,Skopko2012,Vogels1999,Traeger2008,Ruemmler1993}. To produce a benchmark with high fidelity one needs to understand not only the technology being used but how it is being implemented within the system being traced \& benchmarked~\cite{Roselli2000,Traeger2008,Ruemmler1993}. All of these aspects will lend to the behavior of the system; from timing \& resource elements to how the managing software governs actions~\cite{Ellard2003,EllardLedlie2003,Douceur1999}. Further more, in pursuing this work one may find unexpected results and learn new things through examination~\cite{Leung2008,Ellard2003,Roselli2000}. \\ -These studies are required in order to evaluate the development of technologies and methodologies along with furthering knowledge of different system aspects and capabilities. -\\ -As has been pointed out by past work, the design of systems is usually guided by an understanding of the file system workloads and user behavior~\cite{Leung2008}. It is for that reason that new studies are constantly performed by the science community, from large scale studies to individual protocol studies~\cite{Leung2008,Ellard2003,Anderson2004,Roselli2000,Vogels1999}. Even within these studies, the information gleaned is only as meaningful as the considerations of how the data is handled.
\\ +Trace collection and analysis has proved its worth over time in previous studies, where one can see important lessons pulled from the research: changes in the behavior of read/write events, overhead concerns originating in system implementation, bottlenecks in communication, and other revelations found in the traces. \\ +Previous tracing work has shown that one of the largest \& broadest hurdles to tackle is that traces (and benchmarks) must be tailored (to every extent) to the system being tested. There are always some generalizations taken into account, but these generalizations can also be a major source of error~\cite{Ellard2003,EllardLedlie2003,Anderson2004,Orosz2013,Dabir2008,Skopko2012,Vogels1999,Traeger2008,Ruemmler1993}. To produce a benchmark with high fidelity, one needs to understand not only the technology being used but also how it is being implemented within the system being traced \& benchmarked~\cite{Roselli2000,Traeger2008,Ruemmler1993}. All of these aspects will lend to the behavior of the system, from timing \& resource elements to how the managing software governs actions~\cite{Ellard2003,EllardLedlie2003,Douceur1999}. Furthermore, in pursuing this work one may find unexpected results and learn new things through examination~\cite{Leung2008,Ellard2003,Roselli2000}. \\ +These studies are required in order to evaluate the development of technologies and methodologies along with furthering knowledge of different system aspects and capabilities. As has been pointed out by past work, the design of systems is usually guided by an understanding of the file system workloads and user behavior~\cite{Leung2008}. It is for that reason that new studies are constantly performed by the science community, from large scale studies to individual protocol studies~\cite{Leung2008,Ellard2003,Anderson2004,Roselli2000,Vogels1999}. Even within these studies, the information gleaned is only as meaningful as the considerations of how the data is handled. \\ -A detailed overview of the tracings and analysis system can be seen in section ~\ref{Tracing System}. The hope is to further the progress made with benchmarks \& tracing in the hope that it too will lend to improving and deepening the knowledge and understanding of these systems so that as a result the technology and methodology is bettered as a whole. +A detailed overview of the tracing and analysis system can be seen in Section~\ref{Trace Collection}. The hope is to further the progress made with benchmarks \& tracing so that this work too will lend to improving and deepening the knowledge and understanding of these systems and, as a result, better the technology and methodology as a whole. \subsection{Contributions} \label{Contributions} -Out of all the elements that make up the tracing system used for this research, there are a few key aspects that are worth covering due to their uniqueness within the system.
These key components of the tracing system are the use of PF\_RING to mitigate timing and resource concerns, the use of proper hardware and software to handle incoming data, along with the tweaking of DataSeries code to create analysis tools for the captured data. \\ % PF\_RING section The addition of PF\_RING lends to the tracing system by minimizing the copying of packets, which, in turn, allows for more accurate timestamping of the incoming traffic packets being captured~\cite{Orosz2013,Skopko2012,PFRING,PFRINGMan}. PF\_RING acts as a kernel module which allows for kernel-based capture and sampling that limits packet loss and timestamping overhead, leading to faster packet capture while efficiently preserving CPU cycles~\cite{PFRING}. This aids in minimizing packet loss/timestamping issues by not passing packets through the kernel data structures~\cite{PFRINGMan}. The other reason PF\_RING is instrumental is that it functions with the 10Gb/s hardware that was installed in the Trace1 server, allowing for full throughput from the network tap on the UITS system. \\ % DataSeries + Code section @@ -160,14 +159,14 @@ When initially designing the tracing system used in this paper, different aspect Another concern was whether or not the system would be able to function optimally during periods of high network traffic. All aspects of the system, from the hardware to the software, have been altered to help combat these concerns and allow for the most accurate packet capturing possible. \\ %About Challenges of system -Challenges include: Interpretation of data, selective importance of information, arbitrary distribution of collected information. +While the limitations of the system were a concern, there were other challenges that had to be tackled in the development of this research. One glaring challenge in building this tracing system was using code written by others: tshark \& DataSeries. While these programs are used within the tracing structure, there are some issues when working with them. These issues ranged from data type limitations of the code to hash value \& checksum miscalculations due to encryption of specific fields/data. Attempts were made to dig in and correct these issues, but they were so inherent to the code being worked with that hacks and workarounds were developed to minimize their effect. Other challenges center around the selection, interpretation, and distribution scope of the data collected. Which fields should be filtered out from the original packet capture? What data is most indicative of the form and function of the network being traced? What should be the scope, with respect to time, of the data being examined? Where will the most interesting information appear? As each obstacle was tackled, new information and ways of examining the data revealed themselves, and with each development different alterations \& corrections were made. %About interpretation of data To some degree these interpretations are easy to make (e.g. file system behavior \& user behavior~\cite{Leung2008}) while others are more complicated (e.g. temporal scaling of occurrences of read/write), but in all scenarios there is still the requirement for human interpretation of the data. While having humans do the interpretations can be advantageous, a lack of all the ``background'' information can also lead to incorrectly interpreting the information.
%About scope of interpretation (effect of time on data seen) -Another human factor of benchmark creation is selecting which information is important or which information will give the greatest insight to the workings on the network. Too little information can lead to incorrect conclusions being drawn about the workings on the system, while too much information (and not knowing which information is pertinent) can lead to erroneous conclusions as well. There is a need to strike a balance between what information is important enough to capture (so as not to slow down the capturing process through needless processing) while still obtaining enough information to acquire the bigger picture of what is going on. Every step of the tracing process requires a degree of human input to decide what network information will end up providing the most complete picture of the network communication and how to interpret that data into meaningful graphs and tables. This can lead to either finds around the focus of the work being done, or even lead to discoveries of other phenomena that end up having far more impact on the overall performance of the system~\cite{Ellard2003}. +A researcher must select which information is important or which will give the greatest insight into the workings of the network. Too little information can lead to incorrect conclusions being drawn about the workings of the system, while too much information (and not knowing which information is pertinent) can lead to erroneous conclusions as well. There is a need to strike a balance between capturing only the information that is important enough (so as not to slow down the capturing process through needless processing) and obtaining enough information to acquire the bigger picture of what is going on. Every step of the tracing process requires a degree of human input to decide what network information will end up providing the most complete picture of the network communication and how to interpret that data into meaningful graphs and tables. This can either lead to findings around the focus of the work being done, or even to discoveries of other phenomena that end up having far more impact on the overall performance of the system~\cite{Ellard2003}. Even when all the information is collected and the most important data has been selected, there is still the issue of what lens should be used to view this information. Because the data being collected is from an active network, there will be differing activity depending on the time of day, week, and scholastic year. For example, although the first week or so of the year may contain a lot of traffic, this does not mean that trends of that period of time will occur for every week of the year (except perhaps the final week of the semester). The trends and habits of the network will change based on the time of year, time of day, and even on the exam schedule. Truly interesting examination of data requires looking at all the different periods of time to see how all these factors play into the communications of the network.
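
To make the PF\_RING paragraph in the Contributions hunk more concrete, below is a minimal capture-loop sketch in C against the PF_RING userland API (assuming the PF_RING 6.x signatures for pfring_open, pfring_enable_ring, and pfring_recv). It is illustrative only and is not the Trace1 capture code; the interface name "eth3", the 9000-byte snaplen, and the application name are placeholder assumptions. The zero-length receive buffer is what asks PF_RING for a pointer into the ring itself, which is the "minimal copying, better timestamps" point made in the text.

/* Minimal PF_RING capture sketch (illustrative; not the paper's capture code).
 * Build (assumes the PF_RING userland library is installed):
 *   gcc pfring_sketch.c -lpfring -lpcap -o pfring_sketch
 */
#include <stdio.h>
#include <string.h>
#include <pfring.h>

int main(void) {
    /* "eth3" and the 9000-byte snaplen are assumptions for illustration. */
    pfring *ring = pfring_open("eth3", 9000, PF_RING_PROMISC);
    if (ring == NULL) {
        perror("pfring_open");
        return 1;
    }
    pfring_set_application_name(ring, (char *) "trace-sketch");
    if (pfring_enable_ring(ring) != 0) {
        fprintf(stderr, "pfring_enable_ring failed\n");
        pfring_close(ring);
        return 1;
    }

    struct pfring_pkthdr hdr;
    memset(&hdr, 0, sizeof(hdr));
    u_char *pkt = NULL;   /* buffer_len 0: PF_RING returns a pointer into the ring, no per-packet copy */
    while (pfring_recv(ring, &pkt, 0, &hdr, 1) > 0) {
        /* hdr.ts is the capture timestamp taken close to the driver,
         * which is the timestamping-accuracy benefit discussed above. */
        printf("%ld.%06ld caplen=%u len=%u\n",
               (long) hdr.ts.tv_sec, (long) hdr.ts.tv_usec, hdr.caplen, hdr.len);
    }

    pfring_close(ring);
    return 0;
}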
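
The challenges paragraph asks which fields should be filtered out of the original packet capture. As a purely illustrative sketch of that kind of narrowing, the C program below drives tshark (via popen) to keep only a timestamp, the endpoints, and the SMB/SMB2 command code for each packet; the capture file name trace.pcap and this particular field list are assumptions, not the field set actually used in this work.

/* Sketch of reducing a raw capture to a handful of SMB/CIFS fields by
 * driving tshark from C (illustrative only; the real pipeline and its
 * chosen field list are not shown here). "trace.pcap" is a placeholder.
 */
#include <stdio.h>

int main(void) {
    /* -Y applies a display filter (SMB1 or SMB2 traffic only);
     * -T fields / -e select the columns kept for later analysis. */
    const char *cmd =
        "tshark -r trace.pcap -Y \"smb || smb2\" -T fields "
        "-e frame.time_epoch -e ip.src -e ip.dst -e smb.cmd -e smb2.cmd "
        "-E separator=,";

    FILE *fp = popen(cmd, "r");
    if (fp == NULL) {
        perror("popen");
        return 1;
    }

    char line[512];
    while (fgets(line, sizeof line, fp) != NULL) {
        fputs(line, stdout);   /* one comma-separated record per packet */
    }
    return pclose(fp);
}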