# paw10003 / TracingPaper

Older
100644 1029 lines (922 sloc) 83.9 KB
Apr 16, 2019
1
% This is "sig-alternate.tex" V2.1 April 2013
2
% This file should be compiled with V2.5 of "sig-alternate.cls" May 2012
3
%
4
% This example file demonstrates the use of the 'sig-alternate.cls'
5
% V2.5 LaTeX2e document class file. It is for those submitting
6
% articles to ACM Conference Proceedings WHO DO NOT WISH TO
7
% STRICTLY ADHERE TO THE SIGS (PUBS-BOARD-ENDORSED) STYLE.
8
% The 'sig-alternate.cls' file will produce a similar-looking,
9
% albeit, 'tighter' paper resulting in, invariably, fewer pages.
10
%
11
% ----------------------------------------------------------------------------------------------------------------
12
% This .tex file (and associated .cls V2.5) produces:
13
% 1) The Permission Statement
14
% 2) The Conference (location) Info information
15
% 3) The Copyright Line with ACM data
16
% 4) NO page numbers
17
%
18
% as against the acm_proc_article-sp.cls file which
19
% DOES NOT produce 1) thru' 3) above.
20
%
21
% Using 'sig-alternate.cls' you have control, however, from within
22
% the source .tex file, over both the CopyrightYear
23
% (defaulted to 200X) and the ACM Copyright Data
24
% (defaulted to X-XXXXX-XX-X/XX/XX).
25
% e.g.
26
27
% \crdata{0-12345-67-8/90/12} will cause 0-12345-67-8/90/12 to appear in the copyright line.
28
%
29
% ---------------------------------------------------------------------------------------------------------------
30
% This .tex source is an example which *does* use
31
% the .bib file (from which the .bbl file % is produced).
32
% REMEMBER HOWEVER: After having produced the .bbl file,
33
% and prior to final submission, you *NEED* to 'insert'
34
% your .bbl file into your source .tex file so as to provide
35
% ONE 'self-contained' source file.
36
%
37
% ================= IF YOU HAVE QUESTIONS =======================
38
% Questions regarding the SIGS styles, SIGS policies and
39
% procedures, Conferences etc. should be sent to
40
41
%
42
% Technical questions _only_ to
43
% Gerald Murray (murray@hq.acm.org)
44
% ===============================================================
45
%
46
% For tracking purposes - this is V2.0 - May 2012
47
48
\documentclass[conference]{IEEEtran}
49
50
\usepackage{listings} % Include the listings-package
51
\usepackage{color}
52
\usepackage{balance}
53
\usepackage{graphicx}
54
\usepackage{url}
55
\usepackage{tabularx,booktabs}
56
\usepackage{multirow}
57
\usepackage[normalem]{ulem}
58
\useunder{\uline}{\ul}{}
59
60
\definecolor{darkgreen}{rgb}{0,0.5,0}
61
\definecolor{mygreen}{rgb}{0,0.6,0}
62
\definecolor{mygray}{rgb}{0.5,0.5,0.5}
63
\definecolor{mymauve}{rgb}{0.58,0,0.82}
64
\lstset{ %
65
backgroundcolor=\color{white}, % choose the background color; you must add \usepackage{color} or \usepackage{xcolor}
66
basicstyle=\ttfamily\scriptsize, % the size of the fonts that are used for the code
67
breakatwhitespace=false, % sets if automatic breaks should only happen at whitespace
68
breaklines=true, % sets automatic line breaking
69
captionpos=b, % sets the caption-position to bottom
70
71
deletekeywords={...}, % if you want to delete keywords from the given language
72
73
extendedchars=true, % lets you use non-ASCII characters; for 8-bits encodings only, does not work with UTF-8
74
frame=single, % adds a frame around the code
75
keepspaces=true, % keeps spaces in text, useful for keeping indentation of code (possibly needs columns=flexible)
76
keywordstyle=\color{blue}, % keyword style
77
% language=C, % the language of the code
78
morecomment=[l]{--},
79
morekeywords={property,set,is,type, constant, enumeration, end, applies, to, inherit, of, *,...}, % if you want to add more keywords to the set
80
numbers=left, % where to put the line-numbers; possible values are (none, left, right)
81
numbersep=5pt, % how far the line-numbers are from the code
82
numberstyle=\tiny\color{mygray}, % the style that is used for the line-numbers
83
rulecolor=\color{black}, % if not set, the frame-color may be changed on line-breaks within not-black text (e.g. comments (green here))
84
showspaces=false, % show spaces everywhere adding particular underscores; it overrides 'showstringspaces'
85
showstringspaces=false, % underline spaces within strings only
86
showtabs=false, % show tabs within strings adding particular underscores
87
stepnumber=1, % the step between two line-numbers. If it's 1, each line will be numbered
88
stringstyle=\color{mymauve}, % string literal style
89
tabsize=2, % sets default tabsize to 2 spaces
90
title=\lstname % show the filename of files included with \lstinputlisting; also try caption instead of title
91
}
92
93
\ifCLASSINFOpdf
94
% \usepackage[pdftex]{graphicx}
95
% declare the path(s) where your graphic files are
96
% \graphicspath{{../pdf/}{../jpeg/}}
97
% and their extensions so you won't have to specify these with
98
% every instance of \includegraphics
99
% \DeclareGraphicsExtensions{.pdf,.jpeg,.png}
100
\else
101
% or other class option (dvipsone, dvipdf, if not using dvips). graphicx
102
% will default to the driver specified in the system graphics.cfg if no
103
% driver is specified.
104
% \usepackage[dvips]{graphicx}
105
% declare the path(s) where your graphic files are
106
% \graphicspath{{../eps/}}
107
% and their extensions so you won't have to specify these with
108
% every instance of \includegraphics
109
% \DeclareGraphicsExtensions{.eps}
110
\fi
111
112
\begin{document}
113
%
114
% paper title
115
% can use linebreaks \\ within to get better formatting as desired
116
\title{A Trace-Based Study of SMB Network File System Workloads in an Academic Enterprise}
117
118
%\author{\IEEEauthorblockN{Paul Wortman and John Chandy}
119
%\IEEEauthorblockA{Department of Electrical and Computer Engineering\\
120
%University of Connecticut, USA\\
121
%(paul.wortman, john.chandy)@uconn.edu
122
%}}
123
124
125
% make the title area
126
\maketitle
127
128
\begin{abstract}
129
Storage system traces are important for examining real-world applications, studying potential bottlenecks, as well as driving benchmarks in the evaluation of new system designs.
130
While file system traces have been well-studied in earlier work, it has been some time since the last examination of the SMB network file system.
131
The purpose of this work is to continue previous SMB studies to better understand the use of the protocol in a real-world production system in use at the University of Connecticut.
132
The main contribution of our work is the exploration of I/O behavior in modern file system workloads as well as new examinations of the inter-arrival times and run times for I/O events.
133
We further investigate if the recent standard models for traffic remain accurate.
Dec 21, 2019
134
Our findings reveal interesting data relating to the number of read and write events. We notice that the number of read events far exceeds writes and that the average number of bytes transferred over the wire is greater for reads as well. Furthermore we find an increase in the use of metadata for overall network communication that can be taken advantage of through the use of smart storage devices.
Apr 16, 2019
135
\end{abstract}
136
137
\section{Introduction}
138
%Mention:
139
%\begin{itemize}
140
% \item Why is it important to re-examine the SMB protocol?
141
% \item Why does examination of network use matter?
142
% \item Need to ensure hash of data and not saving any of the original traffic packets.
143
%\end{itemize}
144
Over the last twenty years, data storage provisioning has been centralized through the
145
use of network file systems. The architectures of these storage systems can vary from
146
storage area networks (SAN), network attached storage (NAS), clustered file systems,
147
hybrid storage, amongst others. However, the front-end client-facing network file
148
system protocol in most enterprise IT settings tends to be, for the most part, solely
149
SMB (Server Message Block) because of the preponderance of MS Windows clients.
150
While there are other network file systems such as Network File System (NFS) and
151
clustered file systems such as Ceph, PanFS, and OrangeFS, they tend to be used less
152
extensively in most non-research networks.
153
154
In spite of the prevalence of SMB usage within most enterprise networks, there has
155
been very little analysis of SMB workloads in prior academic research. The last major study
156
of SMB was nearly a decade ago~\cite{leung2008measurement}, and the nature of storage
157
usage has changed dramatically over the last decade.
Dec 21, 2019
158
It is always important to revisit commonly used protocols to examine their use in comparison to the expected use case(s). This is doubly so for network communications because the nuances of networked data exchange can greatly influence the effectiveness and efficiency of a chosen protocol. Since an examination of SMB has not occurred in the past decade, we took a look at its current implementation and use in a large university network.
Apr 16, 2019
159
%Due to the sensitivity of the captured information, we ensure that all sensitive information is hashed and that the original network captures are not saved.
160
Dec 21, 2019
161
Our study is based on network packet traces collected on the University of Connecticut's centralized storage facility over a period of three weeks in May 2019. This trace-driven analysis can help in the design of future storage products as well as providing data for future performance benchmarks.
Apr 16, 2019
162
%Benchmarks are important for the purpose of developing technologies as well as taking accurate metrics. The reasoning behind this tracing capture work is to eventually better develop accurate benchmarks for network protocol evaluation.
Dec 21, 2019
163
Benchmarks allow for the stress testing of various aspects of a system (e.g. network, single system). Aggregate data analysis collected from traces can lead to the development of synthetic benchmarks. Traces can expose system patterns that can also be reflected in synthetic benchmarks. Finally, the traces themselves can drive system simulations that can be used to evaluate prospective storage architectures.
Apr 16, 2019
164
165
%\begin{itemize}
166
% \item \textbf{Why?:} Benchmarks allow for the stress testing of different/all aspects of a system (e.g. network, single system).
167
% \item \textbf{How:} There are three steps'' to creating a benchmark.
168
% \begin{enumerate}
169
% \item Take a trace of an existing system
170
% \begin{itemize}
171
% \item This is important because this information is how one is able to compare the expected actions of a system (theory) against the traced actions (practice) of the system. Leads to the development of later synthetic benchmarks.
172
% \end{itemize}
173
% \item Determine which aspects of the trace of said system (in an educated arbitrary way) are most representative of what occurred'' during the tracing of the system. Leads to discovery of habits/patterns of the system; which is later used for synthetic benchmark.
174
% \item Use discovered information to produce benchmark
175
% \begin{itemize}
176
% \item Done by either running a repeat of the trace of synthetic benchmark created using trends from trace.
177
% \end{itemize}
178
% \end{enumerate}
179
%\end{itemize}
180
181
Out of all the elements that make up the tracing system used for this research, there are a few key aspects that are worth covering due to customization of the system. These key components of the tracing system are the use of PF\_RING to mitigate timing and resource concerns, the use of proper hardware and software to handle incoming data, along with the tweaking of DataSeries code to create analysis tools for the captured data.
182
% PF\_RING section
183
%The addition of PF\_RING lends to the tracing system by minimizing the copying of packets which, in turn, allows for more accurate timestamping of incoming traffic packets being captured ~\cite{Orosz2013,skopko2012loss,pfringWebsite,PFRINGMan}.
184
PF\_RING acts as a kernel module that aids in minimizing packet loss/timestamping issues by not passing packets through the kernel data structures~\cite{PFRINGMan}.
185
%The other reason PF\_RING is instrumental is that it functions with the 10Gb/s hardware that was installed into the Trace1 server; allowing for full throughput from the network tap on the UITS system. \\
186
% DataSeries + Code section
187
The tweaks and code additions to the existing DataSeries work are filtering for specific SMB protocol fields along with the writing of analysis tools to parse and dissect the captured packets. Specific fields were chosen to be the interesting fields kept for analysis. It should be noted that this was done originally arbitrarily and changes/additions have been made as the value of certain fields were determined to be worth examining; e.g. multiple runs were required to refine the captured data for later analysis. The code written for analysis of the captured DataSeries format packets focuses on I/O events and ID tracking (TID/UID). The future vision for this information is to combine ID tracking with the OpLock information in order to track resource sharing of the different clients on the network. As well as using IP information to recreate communication in a larger network trace to establish a better benchmark.
188
189
%Focus should be aboiut analysis and new traces
Dec 21, 2019
190
The contributions of this work are the new traces of SMB traffic over a larger university network as well as new analysis of this traffic. Our new examination of the captured data reveals that despite the streamlining of the CIFS/SMB protocol to be less chatty, the majority of SMB communication is metadata-based I/O. We found that read operations occur in greater numbers and cause a larger overall number of bytes to pass over the network. However, the average number of bytes transferred for each write I/O is greater than that of the average read operation. We also find that the current standard for modeling network I/O holds for the majority of operations, while a more representative model needs to be developed for reads.
Apr 16, 2019
191
192
\subsection{Related Work}
193
In this section we discuss previous studies examining traces and testing that has advanced benchmark development. We summarize major works in trace study in Table~\ref{tbl:studySummary}. In addition we examine issues that occur with traces and the assumptions in their study.
194
\begin{table*}[]
195
\centering
196
\begin{tabular}{|r|c|c|c|c|c|}
197
\hline
198
Study & Date of Traces & FS/Protocol & Network FS & Trace Approach & Workload \\ \hline
199
Ousterhout, \textit{et al.}~\cite{ousterhout1985trace} & 1985 & BSD & & Dynamic & Engineering \\ \hline
200
Ramakrishnan, \textit{et al.}~\cite{ramakrishnan1992analysis} & 1988-89 & VAX/VMS & x & Dynamic & Engineering, HPC, Corporate \\ \hline
201
Baker, \textit{et al.}~\cite{baker1991measurements} & 1991 & Sprite & x & Dynamic & Engineering \\ \hline
202
Gribble, \textit{et al.}~\cite{gribble1996self} & 1991-97 & Sprite, NFS, VxFS & x & Both & Engineering, Backup \\ \hline
203
Douceur and Bolosky~\cite{douceur1999large} & 1998 & FAT, FAT32, NTFS & & Snapshots & Engineering \\ \hline
204
Vogels~\cite{vogels1999file} & 1998 & FAT, NTFS & & Both & Engineering, HPC \\ \hline
205
Zhou and Smith~\cite{zhou1999analysis} & 1999 & VFAT & & Dynamic & PC \\ \hline
206
Roselli, \textit{et al.}~\cite{roselli2000comparison} & 1997-00 & VxFS, NTFS & & Dynamic & Engineering, Server \\ \hline
207
Malkani, \textit{et al.}~\cite{malkani2003passive} & 2001 & NFS & x & Dynamic & Engineering, Email \\ \hline
208
Agrawal, \textit{et al.}~\cite{agrawal2007five} & 2000-2004 & FAT, FAT32, NTFS & & Snapshots & Engineering \\ \hline
209
Leung, \textit{et al.}~\cite{leung2008measurement} & 2007 & CIFS & x & Dynamic & Corporate, Engineering \\ \hline
210
%Traeger, \textit{et al.}~\cite{traeger2008nine} & 2008 & FUSE & x & Snapshots & Backup \\ \hline
211
Vrable, \textit{et al.}~\cite{vrable2009cumulus} & 2009 & FUSE & x & Snapshots & Backup \\ \hline
212
Benson, \textit{et al.}~\cite{benson2010network} & 2010 & AFS, MapReduce, NCP, SMB & x & Dynamic & Academic, Corporate \\ \hline
213
Chen, \textit{et al.}~\cite{chen2012interactive} & 2012 & MapReduce & x & Dynamic & Corporate \\ \hline
214
This paper & 2017 & SMB & x & Dynamic & Academic, Engineering, Backup \\ \hline
215
\end{tabular}
216
\caption{Summary of major file system studies over the past decades. For each study the table shows the dates of the trace data, the file system or protocol studied, whether it involved network file systems, the trace methodology used, and the workloads studied. Dynamic trace studies are those that involve traces of live requests. Snapshot studies involve snapshots of file system contents.}
217
\label{tbl:studySummary}
218
\vspace{-2em}
219
\end{table*}
220
221
Tracing collection and analysis has proved its worth in time from previous studies where one can see important lessons pulled from the research: changes in behavior of read/write events, overhead concerns originating in system implementation, bottlenecks in communication, and other revelations found in the traces.
222
Previous tracing work has shown that one of the largest \& broadest hurdles to tackle is that traces (and benchmarks) must be tailored to the system being tested. There are always some generalizations taken into account but these generalizations can also be a major source of error~\cite{vogels1999file,malkani2003passive,seltzer2003nfs,anderson2004buttress,Orosz2013,dabir2007bottleneck,skopko2012loss,traeger2008nine,ruemmler1992unix}. To produce a benchmark with high fidelity one needs to understand not only the technology being used but how it is being implemented within the system~\cite{roselli2000comparison,traeger2008nine,ruemmler1992unix}. All of these aspects will lend to the behavior of the system; from timing \& resource elements to how the managing software governs actions~\cite{douceur1999large,malkani2003passive,seltzer2003nfs}. Furthermore, in pursuing this work one may find unexpected results and learn new things through examination~\cite{leung2008measurement,roselli2000comparison,seltzer2003nfs}.
223
These studies are required in order to evaluate the development of technologies and methodologies along with furthering knowledge of different system aspects and capabilities. As has been pointed out by past work, the design of systems is usually guided by an understanding of the file system workloads and user behavior~\cite{leung2008measurement}. It is for that reason that new studies are constantly performed by the science community, from large scale studies to individual protocol studies~\cite{leung2008measurement,vogels1999file,roselli2000comparison,seltzer2003nfs,anderson2004buttress}. Even within these studies, the information gleaned is only as meaningful as the considerations of how the data is handled.
224
225
The work done by Leung et al.~\cite{leung2008measurement} found observations related to the infrequency of files to be shared by more than one client. Over 67\% of files were never opened by more than one client.
226
Leung \textit{et al.}'s work led to a series of observations, from the fact that files are rarely re-opened to finding that read-write access patterns are more frequent~\cite{leung2008measurement}.
227
%If files were shared it was rarely concurrently and usually as read-only; where 5\% of files were opened by multiple clients concurrently and 90\% of the file sharing was read only.
228
%Concerns of the accuracy achieved of the trace data was due to using standard system calls as well as errors in issuing I/Os leading to substantial I/O statistical errors.
229
% Anderson Paper
230
The 2004 paper by Anderson et al.~\cite{anderson2004buttress} makes the following observations. A source of decreased precision came from the kernel overhead for providing timestamp resolution. This would introduce substantial errors in the observed system metrics due to the use of inaccurate tools when benchmarking I/O systems. These errors in perceived I/O response times can range from +350\% to -15\%.
231
%I/O benchmarking widespread practice in storage industry and serves as basis for purchasing decisions, performance tuning studies and marketing campaigns.
232
Issues of inaccuracies in scheduling I/O can result in as much as a factor of 3.5 difference in measured response time and a factor of 26 in measured queue sizes. These inaccuracies pose too much of an issue to ignore.
233
234
Orosz and Skopko examined the effect of the kernel on packet loss in their 2013 paper~\cite{Orosz2013}. Their work showed that when taking network measurements the precision of the timestamping of packets is a more important criterion than low clock offset, especially when measuring packet inter-arrival times and round-trip delays at a single point of the network. One concern is that Dumpcap is a single threaded application and was suspected to be unable to handle new arriving packets due to a small size of the kernel buffer. Work by Dabir and Matrawy, in 2008~\cite{dabir2007bottleneck}, attempted to overcome this limitation by using two semaphores to buffer incoming strings and improve the writing of packet information to disk.
235
236
Narayan and Chandy examined the concerns of distributed I/O and the different models of parallel application I/O.
237
%There are five major models of parallel application I/O. (1) Single output file shared by multiple nodes. (2) Large sequential reads by a single node at the beginning of computation and large sequential writes by a single node at the end of computation. (3) Checkpointing of states. (4) Metadata and read intensive (e.g. small data I/O and frequent directory lookups for reads).
Dec 21, 2019
238
Due to the striping of files across multiple nodes, this can cause any read or write to access all the nodes; which does not decrease the inter-arrival times (IATs) seen. As the number of I/O operations increase and the number of nodes increase, the IAT times decreased.
Apr 16, 2019
239
Observations from Skopko in a 2012 paper~\cite{skopko2012loss} examined the nuanced concerns of software based capture solutions. The main observation was that software solutions relied heavily on OS packet processing mechanisms. Furthermore, depending on the mode of operation (e.g. interrupt or polling), the timestamping of packets would change.
240
241
As seen in previous trace work done~\cite{leung2008measurement,roselli2000comparison,seltzer2003nfs}, the general perceptions of how computer systems are being used versus their initial purpose have allowed for great strides in eliminating actual bottlenecks rather than spending unnecessary time working on imagined bottlenecks. Without illumination of these underlying actions (e.g. read-write ratios, file death rates, file access rates) these issues can not be readily tackled.
242
\\
243
244
\section{Background}
245
The Server Message Block (SMB) is an application-layer network protocol mainly used for providing shared access to files, shared access to printers, shared access to serial ports, miscellaneous communications between nodes on the network, as well as providing an authenticated inter-process communication mechanism.
246
%The majority of usage for the SMB protocol involves Microsfot Windows. Almost all implementations of SMB servers use NT Domain authentication to validate user-access to resources
247
The SMB 1.0 protocol has been found to have a significant performance impact due to latency issues. Monitoring revealed a high degree of ``chattiness'' and disregard of network latency between hosts. Solutions to this problem were included in the updated SMB 2.0 protocol, which decreases ``chattiness'' by reducing commands and sub-commands from over a hundred to nineteen. Additional changes, most significantly increased security, were implemented in the SMB 3.0 protocol (previously named SMB 2.2).
248
249
250
The rough order of communication for SMB session file interaction contains about five steps. First is a negotiation where a Microsoft SMB Protocol dialect is determined. Next a session is established to determine the share-level security. After this the Tree ID (TID) is determined for the share to be connected to as well as a file ID (FID) for a file requested by the client. From this establishment, I/O operations are performed using the FID given in the previous step.
251
252
% Information relating to the capturing of SMB information
253
The only data that needs to be tracked from the SMB traces are the UID and TID for each session. The MID value is used for tracking individual packets in each established session. The PID tracks the process running the command or series of commands on a host.
254
255
Some nuances of SMB protocol I/O are:
256
\begin{itemize}
257
\item SMB/SMB2 write request is the command that pushes bytes over the wire. \textbf{Note:} the response packet only confirms their arrival and use (e.g. writing).
258
\item SMB/SMB2 read response is the command that pushes bytes over the wire. \textbf{Note:} The request packet only asks for the data.
259
\end{itemize}
260
% Make sure to detail here how exactly IAT/RT are each calculated
261
262
\begin{figure}
263
\includegraphics[width=0.5\textwidth]{./images/smbPacket.jpg}
264
\caption{Visualization of SMB Packet}
265
\label{fig:smbPacket}
266
\end{figure}
267
268
\subsection{Issues with Tracing}
269
\label{Issues with Tracing}
270
There are three general approaches to creating a benchmark based on a trade-off between experimental complexity and resemblance to the original application. (1) Connect the system to a production test environment, run the application, and measure the application metrics. (2) Collect traces from running the application and replay them (after possible modification) back on the test I/O system. (3) Generate a synthetic workload and measure the system performance.
271
272
The majority of benchmarks are attempts to represent a known system and structure on which some ``original'' design/system was tested. While this is all well and good, there are many issues with this sort of approach: temporal \& spatial scaling concerns, timestamping and buffer copying, as well as driver operation for capturing packets~\cite{Orosz2013,dabir2007bottleneck,skopko2012loss}. Each of these aspects contribute to the initial problems with dissection and analysis of the captured information. For example, inaccuracies in scheduling I/Os may result in as much as a factor of 3.5 differences in measured response time and a factor of 26 in measured queue sizes; differences that are too large to ignore~\cite{anderson2004buttress}.
273
Dealing with timing accuracy and high throughput involves three challenges. (1) Designing for dealing with peak performance requirements. (2) Coping with OS timing inaccuracies. (3) Working around unpredictable OS behavior; e.g. mechanisms to keep time and issue I/Os or performance effects due to interrupts.
274
275
Temporal scaling refers to the need to account for the nuances of timing with respect to the run time of commands; consisting of computation, communication \& service. A temporally scalable benchmarking system would take these subtleties into account when expanding its operation across multiple machines in a network. While these temporal issues have been tackled for a single processor (and even somewhat for cases of multi-processor), these same timing issues are not properly handled when dealing with inter-network communication. Inaccuracies in packet timestamping can be caused due to overhead in generic kernel-time based solutions, as well as use of the kernel data structures~\cite{PFRINGMan,Orosz2013}.
276
Spatial scaling refers to the need to account for the nuances of expanding a benchmark to incorporate a number of (\textbf{n}) machines over a network. A system that properly incorporates spatial scaling is one that would be able to incorporate communication (even in varying intensities) between all the machines on a system, thus stress testing all communicative actions and aspects (e.g. resource locks, queueing) on the network.
277
278
\section{Packet Capturing System}
279
In this section, we examine the packet capturing system as well as decisions made that influence its capabilities. We illustrate the existing university network filesystem as well as our methods for ensuring high-speed packet capture. Then, we discuss the analysis code we developed for examining the captured data.
280
% and on the python dissection code we wrote for performing traffic analysis.
281
282
\subsection{UITS System Overview}
283
We collected traces from the University of Connecticut University Information Technology Services (UITS) centralized storage server. The UITS system consists of five Microsoft file server cluster nodes. These blade servers are used to host SMB file shares for various departments at UConn as well as personal drive share space for faculty, staff and students, along with at least one small group of users. Each server is capable of handling 1~Gb/s of traffic in each direction (e.g. outbound and inbound traffic). All together the five blade server system can in theory handle 10~Gb/s of receiving and transmitting data.
284
%Some of these blade servers have local storage but the majority do not have any.
285
The blade servers serve SMB but the actual storage is served by SAN storage nodes that sit behind them. This system does not currently implement load balancing. Instead, the servers are set up to spread the traffic load among four of the active cluster nodes while the fifth node is passive and purposed to take over in the case that any of the other nodes go down (e.g. become inoperable or crash).
286
287
The topology for the packet duplicating element is shown in Figure~\ref{fig:captureTopology}. For our tracing, we installed a 10~Gb network tap on the file server switch, allowing our storage server to obtain a copy of all network traffic going to the 5 file servers. The reason for using 10~Gb hardware is to help ensure that the system is able to capture all information on the network at peak theoretical throughput.
288
289
290
\subsection{High-speed Packet Capture}
291
\label{Capture}
292
%The packet capturing aspect of the tracing system is fairly straight forward.
293
%On top of the previously mentioned alterations to the system (e.g. PF\_RING), the capture of packets is done through the use of \textit{tshark}, \textit{pcap2ds}, and \textit{inotify} programs.
294
%The broad strokes are that incoming SMB/CIFS information comes from the university's network. All packet and transaction information is passed through a duplicating switch that then allows for the tracing system to capture these packet transactions over a 10 Gb port. These packets are
295
%passed along to the \textit{tshark} packet collection program which records these packets into a cyclical capturing ring. A watchdog program (\textit{inotify}) watches the directory where all of these packet-capture (pcap) files are being stored. As a new pcap file is completed \textit{inotify} passes the file to \textit{pcap2ds} along with what protocol is being examined (i.e. SMB). The \textit{pcap2ds} program reads through the given pcap files,
296
297
In order to maximize our faithful capture of the constant rate of traffic, we implement an ntop~\cite{ntopWebsite} solution called PF\_RING~\cite{pfringWebsite} to dramatically improve the storage server's packet capture speed.
298
%A license was obtained for scholastic use of PF\_RING. PF\_RING implements a ring buffer to provide fast and efficient packet capturing. Having implemented PF\_RING, the next step was to
299
We had to tune an implementation of \texttt{tshark} (wireshark's terminal pcap implementation) to maximize the packet capture and dissection into the DataSeries format~\cite{dataseriesGit}.
300
%The assumption being made is that PF\_RING tackles and takes care of the concerns of packets loss due to buffer size, storage, and writing. \textit{tshark} need only read in those packets and generate the necessary DataSeries (ds) files.
301
To optimize this step a capture ring buffer flag is used to minimize the amount of space used to write pcap files, while optimizing the amount of time to
302
%\textit{pcap2ds} can
303
filter data from the pcap files.
304
The capture used a ring buffer in which each captured file was 64000~kB in size.
305
% This causes tshark to switch to the next file after it reaches a determined size.
306
%To simplify this aspect of the capturing process, the entirety of the capturing, dissection, and permanent storage was all automated through watch-dog scripts.
307
\begin{figure*}
308
\includegraphics[width=\textwidth]{./images/packetcapturetopology.png}
Dec 21, 2019
309
\caption{\textcolor{red}{Visualization of Packet Capturing System}}
Apr 16, 2019
310
\label{fig:captureTopology}
311
\end{figure*}
312
The system for taking captured \texttt{.pcap} files and writing them into the DataSeries format (i.e. \texttt{.ds}) does so by first creating a structure (based on a pre-written determination of the data desired to capture). Once the code builds this structure, it then reads through the capture traffic packets while dissecting and filling in the prepared structure with the desired information and format.
Dec 21, 2019
313
Due to the fundamental nature of this work, there is no need to track every piece of information that is exchanged, only that information which illuminates the behavior of the clients \& servers that function over the network (e.g. I/O transactions). It should also be noted that all sensitive information being captured by the tracing system is hashed to protect the users whose information is examined by the tracing system. Furthermore, we now only receive the SMB header information since that contains the I/O information we seek, while the body of the SMB traffic is not passed through to better ensure the security of the university's network communications. It is worth noting that in the case of larger SMB headers, some information is lost; this is a trade-off made by the university to provide, on average, the correctly sized SMB header, but it does lead to scenarios where some information may be captured incompletely.
Apr 16, 2019
314
315
\subsection{DataSeries Analysis}
316
317
Building upon existing code for the interpretation and dissection of the captured \texttt{.ds} files, we developed C/C++ code for examining the captured traffic information. From this analysis, a larger text file is created that contains read, write, create and general I/O information at both a global scale and individual tracking ID (UID/TID) level. In addition, read and write buffer size information is tracked, as well as the inter-arrival and response times. Also included in this data is oplock information and IP addresses. The main contribution of this step is to aggregate seen information for later interpretation of the results.
318
This step also creates an easily digestible output that can be used to re-create all tuple information for SMB/SMB2 sessions that are witnessed over the entire time period.
319
A session is any communication in which a valid UID and TID are used.
320
321
%\subsection{Python Dissection}
322
%The final step of our SMB/SMB2 traffic analysis system is the dissection of the \texttt{AnalysisModule} output using the pandas data analysis library~\cite{pandasPythonWebsite}. The pandas library is a python implementation similar to R. In this section of the analysis structure, the generated text file is tokenized and placed into specific DataFrames representing the data seen for each 15 minute period. The python code is used for the analysis and further dissection of the data. This is where the cumulative distribution frequency and graphing of collected data is performed. Basic analysis and aggregation is also performed in this part of the code. This analysis includes the summation of individual session I/O (e.g. reads, writes, creates) as well as the collection of inter arrival time data and response time data.
323
324
\section{Data Analysis}
325
\label{sec:data-analysis}
326
327
\begin{table}[]
328
\centering
329
\begin{tabular}{|l|l|}
330
\hline
331
% & Academic Engineering \\ \hline
332
%Maximum Tuples in 15-min Window & 36 \\ %\hline
333
%Total Tuples Seen & 2721 \\ \hline
334
%\textcolor{red}{Maximum Sessions in 15-min Window} & 35 \\ %\hline
335
%Maximum Non-Session in 15-min Window & 2 \\ \hline
Dec 21, 2019
336
Total Days & 21 \\ %\hline
337
Total Sessions & 2413589 \\ %\hline
338
%Total Non-Sessions & 279006484 \\ \hline
339
Number of SMB Operations & 281419686 \\ %\hline
340
Number of Read I/Os & 8355557
Apr 16, 2019
341
\\ %\hline
Dec 21, 2019
342
Number of Write I/Os & 7872219 \\ %\hline
343
R:W I/O Ratio & 1.06 \\ %\hline
344
Number of Creates & 54486043 \\ %\hline
345
Number of General SMB Operations & 210705867 \\ \hline
346
Total Data Read (GB) & 0.97 \\ %\hline
347
Total Data Written (GB) & 0.6 \\ %\hline
Apr 16, 2019
348
Average Read Size (B) & 144 \\ %\hline
349
Average Write Size (B) & 63 \\ \hline
350
%Percentage of Read Bytes of Total Data & 99.4\% \\ %\hline
351
%Percentage of Written Bytes of Total Data & 0.6\% \\ %\hline
352
%Total R:W Byte Ratio & 166.446693744549 \\ %\hline
353
%Average R:W Byte Ratio & 0.253996031053668 \\ \hline
354
\end{tabular}
355
\label{tbl:TraceSummaryTotal}
Dec 21, 2019
356
\caption{Summary of Trace I/O Statistics for the time of April 30th, 2019 to May 20th, 2019}
Apr 16, 2019
357
\vspace{-2em}
358
\end{table}
359
% NOTE: Not sure but this reference keeps referencing the WRONG table
360
361
Table~\ref{tbl:TraceSummaryTotal}
Dec 21, 2019
362
shows a summary of the I/O operations, response times, and inter arrival times observed for the network filesystem. This table illustrates that the majority of I/O operations are general: $74.87$\% of the network file system I/O are metadata operations.
Apr 16, 2019
363
Dec 21, 2019
364
Our examination of the collected network filesystem data revealed interesting patterns for the current use of CIFS/SMB in a large engineering academic setting. The first is that there is a major shift away from read and write operations towards more metadata-based ones. This matches the last CIFS observations made by Leung et~al.~that files were being generated and accessed infrequently. The change in operations is due to a movement of user activity from reading and writing data to simply checking file and directory metadata. However, since the earlier study, SMB has transitioned to the SMB2 protocol, which was supposed to be less ``chatty'' and thus we would expect fewer general SMB operations. Table~\ref{tbl:SMBCommands} shows a breakdown of SMB and SMB2 usage over the time period of May. From this table, one can see that the SMB2 protocol makes up $99.14$\% of total network operations compared to just $0.86$\% for SMB, indicating that most clients have upgraded to SMB2. However, $74.66$\% of SMB2 I/O are still general operations. Contrary to the purpose of implementing the SMB2 protocol, there is still a large amount of general I/O.
Apr 16, 2019
365
%While CIFS/SMB protocol has less metadata operations, this is due to a depreciation of the SMB protocol commands, therefore we would expect to see less total operations (e.g. $0.04$\% of total operations).
366
%The infrequency of file activity is further strengthened by our finding that within a week long window of time there are no Read or Write inter arrival times that can be calculated.
367
%\textcolor{red}{XXX we are going to get questioned on this. its not likely that there are no IATs for reads and writes}
Dec 21, 2019
368
General operations happen at very high frequency, with inter arrival times that were found to be relatively short (1317~$\mu$s on average).
Apr 16, 2019
369
Dec 21, 2019
370
Taking a deeper look at the SMB2 operations, shown in the bottom half of Table~\ref{tbl:SMBCommands}, we see that $9.06$\% of the general operations are negotiate commands. These are commands sent by the client to notify the server which dialects of the SMB2 protocol the client can understand. The three most common commands are close, tree connect, and query info.
371
The latter two relate to metadata information of shares and files accessed, while the close operation relates to the create operations relayed over the network. Note that the create command is also used to open a file. The first thing one will notice is that the number of closes is greater than the total number of create operations, by $9.35$\%. These extra close operations are most likely due to applications issuing close operations that do not need to be performed.
Apr 16, 2019
372
373
\begin{table}
374
\centering
375
\begin{tabular}{|l|c|c|c|}
376
\hline
377
I/O Operation & SMB & SMB2 & Both \\ \hline
Dec 21, 2019
378
Read Operations & 1931 & 8353626 & 8355557 \\
379
Read \% & 0.08\% & 2.99\%& 2.97\%\\
380
Write Operations & 303 & 7871916 & 7872219 \\
381
Write \% & 0.01\% & 2.82\% & 2.80\% \\
382
Create Operations & 0 & 54486043 & 54486043 \\
383
Create \% & 0.00\% & 19.53\% & 19.36\% \\
384
General Operations & 2418980 & 208286887 & 210705867 \\
385
General \% & 99.91\% & 74.66\% & 74.87\% \\ \hline
386
Combine Protocol Operations & 2421214 & 278998472 & 281419686 \\
387
Combined Protocols \% & 0.86\% & 99.14\% & 100\% \\ \hline
Apr 16, 2019
388
%\end{tabular}
389
%\caption{\label{tbl:SMBCommands}Percentage of SMB and SMB2 Protocol Commands on March 15th}
390
%\end{table}
391
%\begin{table}
392
%\centering
393
%\begin{tabular}{|l|c|c|}
394
\hline \hline
395
SMB2 General Operation & \multicolumn{2}{|c|}{Occurrences} & Percentage of Total \\ \hline
Dec 21, 2019
396
Negotiate & \multicolumn{2}{|c|}{25276447} & 9.06\% \\
397
Session Setup & \multicolumn{2}{|c|}{2041208} & 0.73\%\\
398
Logoff & \multicolumn{2}{|c|}{143592} & 0.05\% \\
399
Tree Connect & \multicolumn{2}{|c|}{48414491} & 17.35\% \\
400
Tree Disconnect & \multicolumn{2}{|c|}{9773361} & 3.5\% \\
401
Close & \multicolumn{2}{|c|}{80114256} & 28.71\% \\
402
Flush & \multicolumn{2}{|c|}{972790} & 0.35\% \\
403
Lock & \multicolumn{2}{|c|}{1389250} & 0.5\% \\
404
IOCtl & \multicolumn{2}{|c|}{4475494} & 1.6\% \\
Apr 16, 2019
405
Cancel & \multicolumn{2}{|c|}{0} & 0.00\% \\
Dec 21, 2019
406
Echo & \multicolumn{2}{|c|}{4715} & 0.002\% \\
407
Query Directory & \multicolumn{2}{|c|}{3443491} & 1.23\% \\
408
Change Notify & \multicolumn{2}{|c|}{612850} & 0.22\% \\
409
Query Info & \multicolumn{2}{|c|}{27155528} & 9.73\% \\
410
Set Info & \multicolumn{2}{|c|}{4447218} & 1.59\% \\
411
Oplock Break & \multicolumn{2}{|c|}{22397} & 0.008\% \\ \hline
Apr 16, 2019
412
\end{tabular}
Dec 21, 2019
413
\caption{\label{tbl:SMBCommands}Percentage of SMB and SMB2 Protocol Commands from April 30th, 2019 to May 20th, 2019. Breakdown of General Operations for SMB2}
Apr 16, 2019
414
\vspace{-2em}
415
\end{table}
416
Dec 21, 2019
417
\begin{table}[]
418
\centering
419
\begin{tabular}{|l|l|l|}
420
\hline
421
SMB2 Filename Extension & Occurrences & Percentage of Total \\ \hline
422
-Travel & 33396147 & 15.26 \\
423
o & 28670784 & 13.1 \\
424
e & 28606421 & 13.07 \\
425
N & 27639457 & 12.63 \\
426
one & 27615505 & 12.62 \\
427
\textless{}No Extension\textgreater{} & 27613845 & 12.62 \\
428
d & 2799799 & 1.28 \\
429
l & 2321338 & 1.06 \\
430
x & 2108279 & 0.96 \\
431
h & 2019714 & 0.92 \\ \hline
432
\end{tabular}
433
\caption{\textcolor{red}{Top 10 File Extensions Seen Over Three Week Period}}
434
\label{tab:top10SMB2FileExts}
435
\end{table}
436
437
\begin{table}[]
438
\centering
439
\begin{tabular}{|l|l|l|}
440
\hline
441
SMB2 Filename Extension & Occurrences & Percentage of Total \\ \hline
442
doc & 352958 & 0.16 \\
443
docx & 291047 & 0.13 \\
444
ppt & 46706 & 0.02 \\
445
pptx & 38604 & 0.02 \\
446
xls & 218031 & 0.1 \\
447
xlsx & 180676 & 0.08 \\
448
odt & 28 & 1.28e-05 \\
449
pdf & 375601 & 0.17 \\
450
xml & 1192840 & 0.54 \\
451
txt & 167827 & 0.08 \\ \hline
452
\end{tabular}
453
\caption{\textcolor{red}{Common File Extensions Seen Over Three Week Period}}
454
\label{tab:commonSMB2FileExts}
455
\end{table}
456
Apr 16, 2019
457
\subsection{I/O Data Request Sizes}
Dec 21, 2019
458
%\textcolor{red}{Figures~\ref{fig:IO-All} and~\ref{fig:IO-R+W} show the amount of I/O in 15-minute periods during the week of March 12-18, 2017.
459
%The general I/O (GIO) value is representative of I/O that does not include read, write, or create actions. For the most part, these general I/O are mostly metadata operations. As one can see in Figure~\ref{fig:IO-All}, the general I/O dominates any of the read or write operations. Figure~\ref{fig:IO-R+W} is a magnification of the read and write I/O from Figure~\ref{fig:IO-All}. Here we see that the majority of I/O operations belong to reads. There are some spikes where more write I/O occur, but these events are in the minority. One should also notice that, as would be expected, the spikes of I/O activity occur around the center of the day (e.g. 8am to 8pm), and during the week (March 12 was a Sunday and March 18 was a Saturday).}
460
Apr 16, 2019
461
%\begin{figure}
462
% \includegraphics[width=0.5\textwidth]{./images/AIO.pdf}
463
% \caption{All I/O}
464
% \label{fig:IO-All}
465
%\end{figure}
466
%\begin{figure}
467
% \includegraphics[width=0.5\textwidth]{./images/RWIO-win.pdf}
468
469
% \label{fig:IO-R+W}
470
%\end{figure}
Dec 21, 2019
471
Figures~\ref{fig:PDF-Bytes-Read} \&~\ref{fig:PDF-Bytes-Write} show the probability density function (PDF) of the different sizes of bytes transferred for read and write I/O operations, respectively. The most noticeable aspect of these graphs is that the majority of bytes transferred for read and write operations is around 64 bytes. It is worth noting that write I/O also have a larger number of very small transfer amounts. This is unexpected in terms of the amount of data passed in a frame. Our belief is that this is due to a large number of long term calculations/scripts being run that only require small but frequent updates. This assumption was later validated in part when examining the files transferred, as some were related to running scripts creating a large volume of files.
Apr 16, 2019
472
473
474
%\begin{figure}
475
% \includegraphics[width=0.5\textwidth]{./images/aggAvgBytes.pdf}
476
% \caption{Average Bytes by I/O}
477
% \label{fig:Agg-AvgBytes}
478
%\end{figure}
479
%
480
%\begin{figure}
481
% \includegraphics[width=0.5\textwidth]{./images/bytesCompare.pdf}
482
% \caption{Total Bytes by I/O}
483
% \label{fig:bytesCompare}
484
%\end{figure}
485
Dec 21, 2019
486
\begin{figure}
487
488
\caption{PDF of Bytes Transferred for Read I/O}
489
490
\end{figure}
491
Apr 16, 2019
492
\begin{figure}
493
494
\caption{CDF of Bytes Transferred for Read I/O}
495
496
\end{figure}
497
Dec 21, 2019
498
\begin{figure}
499
\includegraphics[width=0.5\textwidth]{./images/smb_write_bytes_pdf.pdf}
500
\caption{PDF of Bytes Transferred for Write I/O}
501
\label{fig:PDF-Bytes-Write}
502
\end{figure}
503
Apr 16, 2019
504
\begin{figure}
505
\includegraphics[width=0.5\textwidth]{./images/smb_write_bytes_cdf.pdf}
506
\caption{CDF of Bytes Transferred for Write I/O}
507
\label{fig:CDF-Bytes-Write}
508
\end{figure}
509
510
%\begin{figure}
511
% \includegraphics[width=0.5\textwidth]{./images/CDF-ioBuff-win.pdf}
512
% \caption{CDF of Bytes Transferred for Read+Write I/O}
513
% \label{fig:CDF-Bytes-RW}
514
%\end{figure}
Dec 21, 2019
515
Figures~\ref{fig:CDF-Bytes-Read} and~\ref{fig:CDF-Bytes-Write} show cumulative distribution functions (CDF) for bytes read and bytes written. As can be seen, the bytes transferred via reads increase by over $50$\% starting at 32 bytes, while the writes have approximately $20$\% below 32 bytes. Table~\ref{fig:transferSizes} shows a tabular view of this data. For reads, $34.97$\% are between 64 and 512 bytes, with another $28.86$\% at 64 byte request sizes. There is a negligible percentage of read requests larger than 512 bytes.
516
This read data is similar to what was observed by Leung et al. Writes, on the other hand, are very different. Leung et al. showed that writes were $60$-$70$\% less than 4K and $90$\% less than 64K. In our data, however, we see that only $11.16$\% of writes are less than 4K, $52.41$\% are 64K requests, and only $43.63$\% of requests are less than 64K writes.
517
In the ten years since the last study, it is clear that writes have become significantly larger. This may be explained by the fact that large files, and multiple files, are being written as standardized blocks more fitting to the larger data-sets and disk space available. This could be as an effort to improve the fidelity of data across the network, allow for better realtime data consistency between client and backup locations, or could just be due to a large number of scripts being run that create and update a series of relatively smaller documents.
518
%\textbf{Note: It seems like a change in the order of magnitude that is being passed per packet. What would this indicate?}\textcolor{red}{Answer the question. Shorter reads/writes = better?}
Apr 16, 2019
519
520
\begin{table}
521
\centering
522
\begin{tabular}{|l|c|c|}
523
\hline
524
Transfer size & Reads & Writes \\ \hline
Dec 21, 2019
525
$< 4$ & 0.098\% & 11.16\% \\
526
$= 4$ & 1.16\% & 4.13\% \\
527
$>4, < 64$ & 34.89\% & 28.14\% \\
528
$= 64$ & 28.86\% & 52.41\% \\
529
$>64, < 512$ & 34.97\% & 4.15\% \\
530
$= 512$ & 0.002\% & 2.54e-5\% \\
531
$= 1024$ & 1.22e-5\% & 3.81e-5\% \\ \hline
Apr 16, 2019
532
\end{tabular}
533
\caption{\label{fig:transferSizes}Percentage of transfer sizes for reads and writes}
534
\vspace{-2em}
535
\end{table}
536
Dec 21, 2019
537
In comparing the read, write, and create operations we found that the vast majority of these types of I/O belong to creates. Furthermore, read operations account for the largest aggregate of bytes transferred over the network. However, the amount of bytes transferred by write commands is not far behind, although, non-intuitively, it includes a larger number of standardized, relatively smaller writes. The most unexpected finding of the data is that all the read and write operations are performed using much smaller buffers than expected; about an order of magnitude smaller (e.g. bytes instead of kilobytes).
Apr 16, 2019
538
539
% XXX I think we should get rid of this figure - not sure it conveys anything important that is not better conveyed than the CDF
540
%Figure~\ref{fig:Agg-AvgRT} shows the average response time (RT) for the different I/O operations. The revealing information is that write I/Os take the longest average time. This is expected since writes transfer more data on average. There is an odd spike for create I/O which can be due to a batch of files or nested directories being made. There are points where read I/O RT can be seen, but this only occurs in areas where large RT for write I/O occur. This is attributed to a need to verify the written data.
541
542
%\begin{figure}
543
% \includegraphics[width=0.5\textwidth]{./images/aggAvgRTs-windowed.pdf}
544
% \caption{Average Response Time by I/O Operation}
545
% \label{fig:Agg-AvgRT}
546
%\end{figure}
547
548
% XXX I think we should get rid of this figure - not sure it conveys anything important that is not better conveyed than the CDF
549
%Figure~\ref{fig:Agg-AvgBytes} shows the average inter arrival time (IAT) for the different I/O operations. \textcolor{red}{Issue: Data only exists for general operations, NOT for other operations. In other words, data for all other operations was IAT of zero.} \textcolor{blue}{Idea: This is due to single operation by a single user and then no operation being performed again. This would aligns with the ideas of Lueng et.~al.~who noticed that files were being interacted with only once or twice and then not again.}
550
551
%\begin{figure}
552
% \includegraphics[width=0.5\textwidth]{./images/aggAvgIATs-windowed.pdf}
553
% \caption{Average Inter Arrival Time by I/O Operation}
554
% \label{fig:Agg-AvgIAT}
555
%\end{figure}
556
557
%The following is a list of data collected and why:
558
%\begin{itemize}
559
% \item TID-to-IP map: with the hashing, the only way to maintain mapping of share-types' (i.e. share-paths) to TIDs is via IP (reverse DNS).
560
% \item FID Data: holds the number of reads, writes, and size of the FID (tracked) for which this information is tracked (per FID).
561
% \item Tuple Data: holds the reads and writes performed by a seen tuple (per tuple) along with by the tuple and FID's data.
562
% \item TID Data: holds the number of reads, writes, creates, and total I/O events along with the last time each/any command was seen. Maps are kept of the buffs seen, general IAT, read IAT, write IAT, create IATs.
563
% \item Tuple Info: Tracking the tuples seen along with a map to that tuple's (per tuple) data.
564
% \item Oplock Data: Tracks the different types of oplocks that are seen per 15 minutes.
565
% \item Read/Write Buff: Maps that are used to track the different sized buffers used for Read/Write commands.
566
% \item filesizeMap': Used for track the different sized buffers to pass data along the network (generic and all inclusive; ie. tuple level data).
567
% \item I/O Events: Track the number of I/O events seen in 15 minute periods. I/Os include - read, write, create, general.
568
%\end{itemize}
569
570
\subsection{I/O Response Times}
571
572
\begin{table}[]
573
\centering
574
\begin{tabular}{|l|l|l|l|l|}
575
\hline
576
& Reads & Writes & Creates & General \\ \hline
Dec 21, 2019
577
I/O \% & 2.97 & \multicolumn{1}{l|}{2.80} & \multicolumn{1}{l|}{19.36} & \multicolumn{1}{l|}{74.87} \\ \hline
578
Avg RT ($\mu$s) & 59819.687407 & \multicolumn{1}{l|}{519.703834} & \multicolumn{1}{l|}{698.082457} & \multicolumn{1}{l|}{7013.37566} \\ \hline
579
Avg IAT ($\mu$s) & 33220.780444 & \multicolumn{1}{l|}{35260.421498} & \multicolumn{1}{l|}{5094.474400} & \multicolumn{1}{l|}{1317.374383} \\ \hline
Apr 16, 2019
580
%\hline
581
%Total RT (s) & 224248 & \multicolumn{1}{l|}{41100} & \multicolumn{1}{l|}{342251} & \multicolumn{1}{l|}{131495} \\ \hline
582
%\% Total RT & 30.34\% & \multicolumn{1}{l|}{5.56\%} & \multicolumn{1}{l|}{46.3\%} & \multicolumn{1}{l|}{17.79\%} \\ \hline
583
\end{tabular}
584
\caption{Summary of Trace Statistics: Average Response Time (RT) and Inter Arrival Time (IAT)}
585
\label{tbl:PercentageTraceSummary}
586
\vspace{-2em}
587
\end{table}
588
589
%\begin{table}[]
590
%\centering
591
%\begin{tabular}{|l|l|l|l|l|l|}
592
%\hline
593
% & Reads & Writes & Creates & General R-W \\ \hline
594
%Total RT (ms) & 224248442 & \multicolumn{1}{l|}{41100075} & \multicolumn{1}{l|}{342251439} & \multicolumn{1}{l|}{131495153} & \multicolumn{1}{l|}{258573201} \\ \hline
595
%\% Total RT & 30.34\% & \multicolumn{1}{l|}{5.56\%} & \multicolumn{1}{l|}{46.3\%} & \multicolumn{1}{l|}{17.79\%} & \multicolumn{1}{l|}{34.99\%} \\ \hline
596
%\end{tabular}
597
%\caption{Summary of Response Time (RT) Statistics: Total RT and Percentage RT per Operation}
598
%\label{tbl:PercentageRTSummary}
599
%\end{table}
600
601
%~!~ Addition since Chandy writing ~!~%
602
Most previous tracing work has not provided data on I/O response times or command latency which serves as an approximation of server load. In
Dec 21, 2019
603
Table~\ref{tbl:PercentageTraceSummary} we show a summary of the response times for read, write, create, and general commands. We note that most general operations have the second longest average response times ($7013.38$ $\mu$s). This exemplifies that these general operations occur in great numbers, run relatively slowly, and happen at high frequency.
604
Other observations of the data show that the number of writes is very close to the number of reads, although the response time for their operations is the shortest. Creates happen more often, but have a slightly slower response time, because most of the create commands are actually opens. Although read operations are only a few percent of the total operations, they have the greatest average response time; more than general I/O.
Apr 16, 2019
605
Dec 21, 2019
606
%\textcolor{red}{To get an indication of how much of an effect these general commands take on overall latency, we also calculated the total aggregate response time for read, write, create, and general operations. We see that even though general commands account for $74.87$\% of all commands, they only account for only $17.8$\% of the total response time. Thus, while the volume of general operations does not present an extraordinary burden on server load, reducing these operations can present a clear performance benefit. We also see that creates take the most amount of time ($46.3$\%) of the total response time for all operations. As seen in Table~\ref{tbl:SMBCommands}, the majority of general operations are negotiations while $28.71$\% are closes; which relate to create operations.
607
%This shows that while creates are only $5.08$\% on March 15th (and $2.5$\% of the week's operations shown in Table~\ref{tbl:PercentageTraceSummary}) of the total operations performed, they are responsible for $46.3$\% of the time spent performing network I/O.}
608
%\textbf{Do we need this above data piece?}
Apr 16, 2019
609
%
610
%% Not Needed to Say Since we have no data
611
%%One key observation is that there were no inter arrival time calculations for read, write, or create operations. We interpret this data to reflect the observations of Leung et.~al.~that noticed that files are interacted with only a few times and then not interacted with again. Extrapolating this concept, we interpret the data to illustrate that files may be read or written once, but then are not examined or interacted with again.
612
%%\textcolor{blue}{This was entirely unexpected and was discovered as a result of our original assumptions made based on what scope we believed to be the best interpretation of user activity on the network filesystem.}
613
%
614
%%\begin{table}[]
615
%%\centering
616
%%\begin{tabular}{|l|l|}
617
%%\hline
618
%% & Count \\ \hline
619
%%Sessions & 122 \\ \hline
620
%%Non-Sessions & 2 \\ \hline
621
%%\end{tabular}
622
%%\caption{Summary of Maximum Session and Non-Session Seen}
623
%%\label{tbl:Counts}
624
%%\end{table}
625
%%
626
%%\textcolor{red}{Not sure if presenting a count of the number of sessions seen is important or worth showing.}
627
%
628
%%\begin{table}[]
629
%%\centering
630
%%\begin{tabular}{|l|l|l|}
631
%%\hline
632
%% & Reads & Writes \\ \hline
633
%%Average & 27167.76 B & 106961.36 B \\ \hline
634
%%Percentage & 99.4\% & 0.6\% \\ \hline
635
%%\end{tabular}
636
%%\caption{Summary of Bytes Transferred Over the Network}
637
%%\label{tbl:Bytes}
638
%%\end{table}
639
%
640
%%\textcolor{red}{Reference the large single table instead}
641
%%Table~\ref{tbl:TraceSummary} shows our findings relating to the total number of bytes transferred over the network due to Read and Write operations. Mimicing the findings from Figure~\ref{fig:Agg-AvgBytes}, the table shows that while the percentage of total bytes passed over the network is dominated by Read operations the average bytes pushed by Write operations is of a magnitude greater.
642
%
643
%%Tables to be included:
644
%%\begin{enumerate}
645
%% \item Return Times:
646
%% \begin{itemize}
647
%% \item General
648
649
%% \item Write
650
%% \item Create
651
652
%% \end{itemize}
653
%% \item Inter Arrival Times
654
%% \begin{itemize}
655
%% \item General
656
657
%% \item Write
658
%% \item Create
659
660
%% \end{itemize}
661
%% \item Bytes per Request (Bytes Over Network)
662
%% \begin{itemize}
663
664
%% \item Write
665
666
%% \end{itemize}
667
%%\end{enumerate}
668
%%Modeling to include:
669
%%\begin{enumerate}
670
%% \item Inter Arrival Time CDF
671
%% \begin{itemize}
672
673
%% \item Write
674
675
%% \end{itemize}
676
%%\end{enumerate}
677
%
Dec 21, 2019
678
Figure~\ref{fig:CDF-IAT-General} shows the inter arrival times CDF for general I/O. As can be seen, SMB commands happen very frequently - $85$\% of commands are issued less than 1024~$\mu s$ apart. As was mentioned above, the SMB protocol is known to be very chatty, and it is clear that servers must spend a lot of time dealing with these commands. For the most part, most of these commands are also serviced fairly quickly as well as seen in Figure~\ref{fig:CDF-RT-General}. Interestingly, the response/return time (RT) for the general metadata operations follows a similar curve to the inter-arrival times.
Apr 16, 2019
679
Dec 21, 2019
680
Next we examine the response time (RT) of the read, write, and create I/O operations that occur over the SMB network filesystem. The response time for write operations (shown in Figure~\ref{fig:CDF-RT-Write}) does not follow the step function similar to the bytes written CDF in Figure~\ref{fig:CDF-Bytes-Write}. This is understandable as the response time for a write would be expected to be a more standardized action and not necessarily proportional to the number of bytes written. However, the read response time (Figure~\ref{fig:CDF-RT-Read}) is smoother than the bytes read CDF (Figure~\ref{fig:CDF-Bytes-Read}). This is most likely due to the fact that some of the reads are satisfied by server caches, thus eliminating some long access times to persistent storage.
681
However, one should notice that the response time on read operations grows at a rate similar to that of write operations. This, again, shows a form of standardization in the communication patterns although some read I/O take a far greater period of time; due to larger amounts of read data sent over several standardized size packets.
Apr 16, 2019
682
%While the RT for Write operations are not included (due to their step function behavior) Figure~\ref{fig:CDF-RT-Read} and Figure~\ref{fig:CDF-RT-RW} show the response times for Read and Read+Write operations respectively. T
Dec 21, 2019
683
%\textcolor{red}{The write I/O step function behavior is somewhat visible in the CDF of both reads and writes in Figures~\ref{fig:CDF-RT-Read}~and~\ref{fig:CDF-RT-Write}. Moreover, this shows that the majority ($80$\%) of read (and write) operations occur within 2~$ms$, the average access time for enterprise storage disks. As would be expected, this is still an order of magnitude greater than the general I/O.}
684
685
\subsection{File Extensions}
686
Tables~\ref{tab:top10SMB2FileExts} and~\ref{tab:commonSMB2FileExts} show a representation of the various file extensions that were seen within the three week capture period. The easier to understand is Table~\ref{tab:commonSMB2FileExts}, which illustrates the number of common file extensions (e.g. doc, ppt, xls, pdf) that were part of the data.
687
The greatest point of note is that the highest percentage is ``.xml'' with $0.54$\%, which we found to be a surprising result. Originally we expected that these common file extensions would make up a much larger portion of traffic; more than $2$\% of total traffic. These concerns were further raised by the results of Table~\ref{tab:top10SMB2FileExts}, which shows the top ten file extensions present in the data; these make up approximately $84$\% of the total seen.
688
Furthermore, the majority of extensions seem very strange if not nonsensical. Upon closer examination of the tracing system it was determined that these file extensions are an artifact of how Windows interprets file extensions. The Windows operating system merely guesses the file type based on the assumed extension (e.g. whatever characters follow after the final `.').
689
There are a large number of files that do not meet this standard idea of having an extension, although we posit an assortment of scenarios that could cause this issue. These range from linux-based library files, manual pages, and odd naming schemes used by scripts or back-up files, to date-times and IPs used as file names. There are undoubtedly many more, but exhaustive determination of all variations is seen as out of scope for this work.
Apr 16, 2019
690
691
% Note: RT + IAT time CDFs exist in data output
692
693
% IAT information
694
695
\begin{figure}
696
\includegraphics[width=0.5\textwidth]{./images/smb_general_iats_cdf.pdf}
697
\caption{CDF of Inter Arrival Time for General I/O}
698
\label{fig:CDF-IAT-General}
699
\end{figure}
700
701
\begin{figure}
702
\includegraphics[width=0.5\textwidth]{./images/smb_general_iats_pdf.pdf}
703
\caption{PDF of Inter Arrival Time for General I/O}
704
\label{fig:PDF-IAT-General}
705
\end{figure}
706
707
\begin{figure}
708
709
\caption{CDF of Inter Arrival Time for Read I/O}
710
711
\end{figure}
712
713
\begin{figure}
714
715
\caption{PDF of Inter Arrival Time for Read I/O}
716
717
\end{figure}
718
719
\begin{figure}
720
\includegraphics[width=0.5\textwidth]{./images/smb_write_iats_cdf.pdf}
721
\caption{CDF of Inter Arrival Time for Write I/O}
722
\label{fig:CDF-IAT-Write}
723
\end{figure}
724
725
\begin{figure}
726
\includegraphics[width=0.5\textwidth]{./images/smb_write_iats_pdf.pdf}
727
\caption{PDF of Inter Arrival Time for Write I/O}
728
\label{fig:PDF-IAT-Write}
729
\end{figure}
730
731
\begin{figure}
732
\includegraphics[width=0.5\textwidth]{./images/smb_create_iats_cdf.pdf}
733
\caption{CDF of Inter Arrival Time for Create I/O}
734
\label{fig:CDF-IAT-Create}
735
\end{figure}
736
737
\begin{figure}
738
\includegraphics[width=0.5\textwidth]{./images/smb_create_iats_pdf.pdf}
739
\caption{PDF of Inter Arrival Time for Create I/O}
740
\label{fig:PDF-IAT-Create}
741
\end{figure}
742
743
% RTs information
744
745
\begin{figure}
746
\includegraphics[width=0.5\textwidth]{./images/smb_general_rts_cdf.pdf}
747
\caption{CDF of Response Time for General I/O}
748
\label{fig:CDF-RT-General}
749
\vspace{-2em}
750
\end{figure}
751
752
\begin{figure}
753
\includegraphics[width=0.5\textwidth]{./images/smb_general_rts_pdf.pdf}
754
\caption{PDF of Response Time for General I/O}
755
\label{fig:PDF-RT-General}
756
\vspace{-2em}
757
\end{figure}
758
759
\begin{figure}
760
761
\caption{CDF of Response Time for Read I/O}
762
763
\vspace{-2em}
764
\end{figure}
765
766
\begin{figure}
767
768
\caption{PDF of Response Time for Read I/O}
769
770
\vspace{-2em}
771
\end{figure}
772
773
\begin{figure}
774
\includegraphics[width=0.5\textwidth]{./images/smb_write_rts_cdf.pdf}
775
\caption{CDF of Response Time for Write I/O}
776
\label{fig:CDF-RT-Write}
777
\vspace{-2em}
778
\end{figure}
779
780
\begin{figure}
781
\includegraphics[width=0.5\textwidth]{./images/smb_write_rts_pdf.pdf}
782
\caption{PDF of Response Time for Write I/O}
783
\label{fig:PDF-RT-Write}
784
\vspace{-2em}
785
\end{figure}
786
787
\begin{figure}
788
\includegraphics[width=0.5\textwidth]{./images/smb_create_rts_cdf.pdf}
789
\caption{CDF of Response Time for Create I/O}
790
\label{fig:CDF-RT-Create}
791
\vspace{-2em}
792
\end{figure}
793
794
\begin{figure}
795
\includegraphics[width=0.5\textwidth]{./images/smb_create_rts_pdf.pdf}
796
\caption{PDF of Response Time for Create I/O}
797
\label{fig:PDF-RT-Create}
798
\vspace{-2em}
799
\end{figure}
800
801
%\begin{figure}
802
% \includegraphics[width=0.5\textwidth]{./images/CDF-ioRT-win.pdf}
803
% \caption{CDF of Response Time for Read+Write I/ O}
804
% \label{fig:CDF-RT-RW}
805
%\end{figure}
806
807
%\begin{figure}
808
% \includegraphics[width=0.5\textwidth]{./images/CDF-rBuff-win.pdf}
809
% \caption{CDF of Bytes Transferred for Read IO}
810
811
%\end{figure}
812
813
%\begin{figure}
814
% \includegraphics[width=0.5\textwidth]{./images/CDF-wBuff-win.pdf}
815
% \caption{CDF of Bytes Transferred for Write IO}
816
% \label{fig:CDF-Bytes-Write}
817
%\end{figure}
818
819
%\begin{figure}
820
% \includegraphics[width=0.5\textwidth]{./images/CDF-ioBuff-win.pdf}
821
% \caption{CDF of Bytes Transferred for Read+Write IO}
822
% \label{fig:CDF-Bytes-RW}
823
%\end{figure}
824
825
%Points worth mentioning:
826
%\begin{itemize}
827
% \item Scale of time is only to the microsecond due to the original pcap file capturing process. \texttt{tshark} only captures to a microsecond scale in our implementation.
828
% \item Due to a complication of how DataSeries stores information, there are potentially more SMB2 packets than actually occurred since $0$ is an acceptable command for SMB2 (although not used for SMB).
829
%\end{itemize}
830
831
\subsection{Distribution Models}
832
833
For simulations and analytic modeling, it is often useful to have models that describe the behavior of storage systems I/O. In this section, we attempt to map traditional probabilistic distributions to the data that we have observed.
834
Specifically, taking the developed CDF graphs, we perform curve fitting to determine the applicability of Gaussian and Weibull distributions to the network filesystem I/O behavior. Note that an exponential distribution, typically used to model interarrival times and response times, is a special case of a Weibull distribution where $k=1$.
835
Table~\ref{tbl:curveFitting} shows best-fit parametrized distributions for the measured data. % along with $R^2$ fitness values.
836
837
%Based on the collected IAT and RT data, the following are the best fit curve representation equations with supporting $R^{2}$ values. In the case of each, it was found that the equation used to model the I/O behavior was a Gaussian equation with a single term.
838
%$$f(x) = a_1 * e^{-((x-b_1)/c_1)^2)}$$
839
%The $R^2$ values for each CDF graph were found to be the following:
840
%\begin{itemize}
841
% \item General Command IAT CDF, shown in Figure~\ref{fig:CDF-IAT-General}, had $R^2$ Value of $0.6704$.
842
% \item General Command RT CDF, shown in Figure~\ref{fig:CDF-RT-General}, had $R^2$ Value of $0.9728$.
843
% \item Read command RT CDF, shown in Figure~\ref{fig:CDF-RT-Read}, had $R^2$ Value of $0.7754$.
844
% \item Write command RT CDF, shown in Figure~\ref{fig:CDF-RT-Write}, had $R^2$ Value of $0.7797$
845
% \item Create command RT CDF, shown in Figure~\ref{fig:CDF-RT-Create}, had $R^2$ Value of $0.07146$
846
% \item Read + Write command RT CDF, shown in Figure~\ref{fig:CDF-RT-RW}, has $R^2$ Value of $0.7837$.
847
%\end{itemize}
848
849
\begin{table}
850
\centering
851
\begin{tabular}{|l|c|c|c||c|c|c|}
852
\hline
853
Model & \multicolumn{3}{|c|}{Gaussian}
854
& \multicolumn{3}{|c|}{Weibull} \\ \hline
855
CDF & \multicolumn{3}{|c|}{$\frac{1}{\sqrt{2\pi}}\int_{-\infty}^{\frac{x-\mu}{\sigma}}e^{\frac{-t^2}{2}}dt$}
856
& \multicolumn{3}{|c|}{$1 - e^{(-x/\lambda)^k}$} \\ \hline \hline
857
I/O Operation & $\mu$ & \multicolumn{2}{|c|}{$\sigma$} & $k$ & \multicolumn{2}{|c|}{$\lambda$} \\ \hline
Dec 21, 2019
858
General IAT & 786.72 & \multicolumn{2}{|c|}{10329.6} & 0.9031 & \multicolumn{2}{|c|}{743.2075} \\
859
General RT & 3606.66 & \multicolumn{2}{|c|}{2.74931e+06} & 0.5652 & \multicolumn{2}{|c|}{980.9721} \\
860
Read RT & 44718.5 & \multicolumn{2}{|c|}{1.72776e+07} & 0.0004 & \multicolumn{2}{|c|}{1.5517} \\
861
Read IAT & 24146 & \multicolumn{2}{|c|}{1.189e+07} & 0.0005 & \multicolumn{2}{|c|}{3.8134} \\
862
Write RT & 379.823 & \multicolumn{2}{|c|}{4021.72} & 0.8569 & \multicolumn{2}{|c|}{325.2856} \\
863
Write IAT & 25785.7 & \multicolumn{2}{|c|}{1.22491e+07} & 0.0004 & \multicolumn{2}{|c|}{3.1287} \\
864
Create RT & 502.084 & \multicolumn{2}{|c|}{21678.4} & 0.9840 & \multicolumn{2}{|c|}{496.9497} \\
865
Create IAT & 3694.82 & \multicolumn{2}{|c|}{4.65553e+06} & 0.0008 & \multicolumn{2}{|c|}{2.3504} \\ \hline
866
%R+W RT & \textcolor{red}{0.8045} & \multicolumn{2}{|c|}{\textcolor{red}{0.2122}} & \textcolor{red}{5.103} & \multicolumn{2}{|c|}{\textcolor{red}{0.3937}} \\ \hline
867
%R+W Byte Transfer & \textcolor{red}{0.3744} & \multicolumn{2}{|c|}{\textcolor{red}{0.2983}} & \textcolor{red}{1.153} & \multicolumn{2}{|c|}{\textcolor{red}{0.3937}} \\
868
Read Buff Transfer & 82.9179 & \multicolumn{2}{|c|}{1117.9} & 1.0548 & \multicolumn{2}{|c|}{85.2525} \\
869
Write Buff Transfer & 46.2507 & \multicolumn{2}{|c|}{640.621} & 1.0325 & \multicolumn{2}{|c|}{46.8707} \\ \hline
Apr 16, 2019
870
\end{tabular}
871
\caption{\label{tbl:curveFitting}Comparison of %$R^2$
872
$\mu$, $\sigma$, $k$, and $\lambda$ Values for Curve Fitting Equations on CDF Graphs}
873
\vspace{-3em}
874
\end{table}
875
876
%The graphs created by the dissection script are:
877
%\begin{itemize}
878
% \item Average IAT (G/R/W/C) - By DateTime.
879
% \item Average Bytes (R/W) - By DateTime.
880
% \item Session I/Os (G/R/W/C) - By DateTime.
881
% \item Non-Session I/Os (G/R/W/C) - By DateTime.
882
% \item Tuple Counts - By DateTime.
883
% \item Total Bytes (R+W/R/W) - By DateTime.
884
% \item Total I/Os (G/R/W) - By DateTime.
885
%\end{itemize}
886
887
%Observations on graphs:
888
%\begin{itemize}
889
% \item Avergage IAT - majority write/general.
890
% \item Total I/O - majority are general I/O.
891
% \item Average Bytes - majority are writes.
892
% \item Bytes Total - majority reads.
893
% \item Tuple counts are close to same as session counts.
894
%\end{itemize}
895
896
%Examination of the Response Time (RT) and Inter Arrival Times (IAT) revealed the speed and frequency with which metadata operations are performed, as well as the infrequency of individual users and sessions to interact with a given share.
897
Dec 21, 2019
898
%% NEED: Run the matlab curve fitting to complete this section of the writing
Apr 16, 2019
899
Our comparison of the existing standard use of an exponential distribution to model network interarrival and response times is still valid. One should notice that the Gaussian distributions
900
% had better $R^2$ result than the exponential equivalent for write operations. This is not surprising due to the step-function shape of the Figure~\ref{fig:CDF-RT-Write} CDF. Examining the $R^2$ results for the read + write I/O operations we find that the exponential distribution is far more accurate at modeling this combined behavior.
Dec 21, 2019
901
for write and create operations are similar, while those for read operations are not. Furthermore, there is less similarity between the modeled behavior of general operation interarrival times and their response times, showing the need for a more refined model for each aspect of the network filesystem interactions.
902
One should also notice that the general operation model is more similar to that of the create operations.
903
This makes sense since create operations are found to dominate the I/O behavior of the network filesystem, which aligns well with the number of existing close operations.
904
%improves the ability of a exponential distribution to model the combined behavior.}
Apr 16, 2019
905
%Observations:
906
%\begin{itemize}
907
% \item Byte data appears in powers of 2 (e.g. 32K, 64K)
908
% \item IAT times most occur in the 0-10000 microsecond range, expect to general I/O which is in a much smaller range. The expectation is that this is because some commands and actions in SMB do not require the establishment of a session, thus allowing for a faster response.
909
% \item The timestamps provided by SMB are only accurate to the microseconds.
910
%\end{itemize}
911
%University information:
912
%\begin{itemize}
913
% \item Central backup server where each has a client.
914
% \item Client notifies 50 servers at once to do backup and as finished move onto the next.
915
% \item Only begin during midnight to 4am while servers must be ready to back-up and clients must respond to back-up.
916
% \item The 50 servers are randomized and incremental back-up takes ~1-2 hours
917
%\end{itemize}
918
%\textbf{Note:} Not sure that we would see this traffic since that would be between the servers and the back-up clients, (not the student clients?).
919
%The collected data shows the following observations about the observed network filesystem.
920
%\begin{itemize}
921
% \item The majority of network operations relate to metadata. This is due to a movement for user activity from reading and writing data to simply checking file and directory metadata.
922
% \item Writes cause the largest amount of data to be passed over the network. While Read operations occur at the largest number and cause the larger total number of bytes to be transferred, write operations are more expensive by an order of magnitude.
923
% \item \textcolor{red}{Here will be observation on the modeling of poisson fit.}
924
%\end{itemize}
925
Due to the large number of metadata operations, the use of smart storage solutions could be used to minimize the impact of these I/O. Smart storage elements can aid by performing metadata operations without the need to access persistent storage, thus causing shorter response times. In this manner, the use of smart storage can also help reduce bottlenecks with larger network filesystems and minimize the effect of traffic on overall network performance.
926
927
\subsection{System Limitations and Challenges}
928
\label{System Limitations and Challenges}
929
When initially designing the tracing system used in this paper, different aspects were taken into account, such as space limitations of the tracing system, packet capture limitations (e.g. file size), and speed limitations of the hardware. One limitation encountered in the packet capture system deals with the functional pcap (packet capture file) size. The concern being that the pcap files only need to be held until they have been filtered for specific protocol information and then compressed using the DataSeries format, but still allow for room for the DataSeries files being created to be stored. Other limitation concerns came from the software and packages used to collect the network traffic data~\cite{Orosz2013,dabir2007bottleneck,skopko2012loss}. These ranged from timestamp resolution provided by the tracing system's kernel~\cite{Orosz2013} to how the packet capturing drivers and programs (such as dumpcap and tshark) operate along with how many copies are performed and how often. The speed limitations of the hardware are dictated by the hardware being used (e.g. GB capture interface) and the software that makes use of this hardware (e.g. PF\_RING). After all, our data can only be as accurate as the information being captured~\cite{seltzer2003nfs,anderson2004buttress}.
930
Another concern was whether or not the system would be able to function optimally during periods of high network traffic. All aspects of the system, from the hardware to the software, have been altered to help combat these concerns and allow for the most accurate packet capturing possible.
931
932
933
While the limitations of the system were concerns, there were other challenges that were tackled in the development of this research.
934
One glaring challenge with building this tracing system was using code written by others; tshark \& DataSeries. While these programs are used within the tracing structure there are some issues when working with them. These issues ranged from data type limitations of the code to hash value and checksum miscalculations due to encryption of specific fields/data. Attempts were made to dig into and correct these issues, but they were so inherent to the code being worked with that hacks and workarounds were developed to minimize their effect. Other challenges centered around selection, interpretations and distribution scope of the data collected. Which fields should be filtered out from the original packet capture? What data is most prophetic to the form and function of the network being traced? What should be the scope, with respect to time, of the data being examined? Where will the most interesting information appear? As each obstacle was tackled, new information and ways of examining the data revealed themselves, and with each development different alterations \& corrections were made.
935
936
Even when all the information is collected and the most important data has been selected, there is still the issue of what lens should be used to view this information. Because the data being collected is from an active network, there will be differing activity depending on the time of day, week, and scholastic year. For example, although the first week or so of the year may contain a lot of traffic, this does not mean that trends of that period of time will occur for every week of the year (except perhaps the final week of the semester). The trends and habits of the network will change based on the time of year, time of day, and even depend on the exam schedule. Truly interesting examination of data requires looking at all different periods of time to see how all these factors play into the communications of the network.
937
% DataSeries Challenge
938
A complication of this process is that the DataSeries code makes use of a push-pop stack for iterating through packet information. This means that if information can not be re-read then errors occur. This can manifest in the scenario where a produced \texttt{.ds} file is corrupted or incomplete, despite the original \texttt{.pcap} file being fine.
939
%This manifested as an approximate loss of \textbf{????} out of every 100,000 files.
940
Normally, one could simply re-perform the conversion process to a DataSeries file, but due to the rate of the packets being captured and security concerns of the data being captured, we are unable to re-run any captured information.
941
942
\section{Conclusions and Future Work}
Dec 21, 2019
943
Our analysis of this university network filesystem illustrated the current implementation and use of the CIFS/SMB protocol in a large academic setting. We notice the effect of caches on the ability of the filesystem to limit the number of accesses to persistent storage. The effect of enterprise storage disk access time can be seen in the response time for read and write I/O. The majority of network communication is dominated by metadata operations, which is of less surprise since SMB is a known chatty protocol. We do notice that the CIFS/SMB protocol continues to be chatty with metadata I/O operations regardless of the version of SMB being implemented; $74.66$\% of I/O being metadata operations for SMB2.
944
We also find that read operations happen in greater number than write operations (at a ratio of 1.06) and the size of their transfers is also greater by a factor of about 2.
945
However, the average write operation includes a larger number of relatively smaller writes. Examination of the return times for these different I/O operations shows that exponential distribution curve fitting equation is most accurate at modeling the CDF of the various I/O operations. This shows that the current model is still effective for the majority of I/O, but that for read operations there needs to be further research in modeling their behavior.
Apr 16, 2019
946
%Our work finds that a single term Gaussian distribution has an $R^2$ value of $0.7797$, but further work needs to be made in order to refine the model.
Dec 21, 2019
947
Our work finds that write and create response times can be modeled similarly, but that the read response times require the alteration of the general model.
948
However, the general I/O can be modeled using the same standard; which has similar shape and scale to that of the write and create operations.
Apr 16, 2019
949
950
\subsection{Future Work}
951
The analysis work will eventually incorporate oplocks and other aspects of resource sharing on the network to gain a more complete picture of the network's usage and bottlenecks.
952
Network filesystem usage from an individual user scope has become simple and does not contain a great deal of read, write, and create operations.
Dec 21, 2019
953
Further analysis will be made in examining how the determined metrics change when examined at the scope of a per share (i.e. TID) or per user (i.e. UID). At this level of examination we will be able to obtain a better idea of how each share is interacted with, as well as how files and directories are shared and access control is implemented.
Apr 16, 2019
954
955
%\end{document} % This is where a 'short' article might terminate
956
957
%ACKNOWLEDGMENTS are optional
958
%\section{Acknowledgments}
959
%This section is optional; it is a location for you
960
%to acknowledge grants, funding, editing assistance and
961
%what have you. In the present case, for example, the
962
%authors would like to thank Gerald Murray of ACM for
963
%his help in codifying this \textit{Author's Guide}
964
%and the \textbf{.cls} and \textbf{.tex} files that it describes.
965
966
%
967
% The following two commands are all you need in the
968
% initial runs of your .tex file to
969
% produce the bibliography for the citations in your paper.
970
\balance
971
\bibliographystyle{IEEEtran}
972
\bibliography{sigproc} % sigproc.bib is the name of the Bibliography in this case
973
% You must have a proper ".bib" file
974
% and remember to run:
975
% latex bibtex latex latex
976
% to resolve all references
977
%
978
% ACM needs 'a single self-contained file'!
979
%
980
%APPENDICES are optional
981
%\balancecolumns
982
%\appendix
983
%%Appendix A
984
985
986
%the body of the article are different in the appendices.
987
%In the \textbf{appendix} environment, the command
988
%\textbf{section} is used to
989
%indicate the start of each Appendix, with alphabetic order
990
%designation (i.e. the first is A, the second B, etc.) and
991
%a title (if you include one). So, if you need
992
%hierarchical structure
993
994
%highest level. Here is an outline of the body of this
995
%document in Appendix-appropriate form:
996
%\subsection{Introduction}
997
%\subsection{The Body of the Paper}
998
%\subsubsection{Type Changes and Special Characters}
999
%\subsubsection{Math Equations}
1000
%\paragraph{Inline (In-text) Equations}
1001
%\paragraph{Display Equations}
1002
%\subsubsection{Citations}
1003
%\subsubsection{Tables}
1004
%\subsubsection{Figures}
1005
%\subsubsection{Theorem-like Constructs}
1006
%\subsubsection*{A Caveat for the \TeX\ Expert}
1007
%\subsection{Conclusions}
1008
%\subsection{Acknowledgments}
1009
1010
%This section is inserted by \LaTeX; you do not insert it.
1011
%You just add the names and information in the
1012
1013
%of the document.
1014
%\subsection{References}
1015
%Generated by bibtex from your ~.bib file. Run latex,
1016
%then bibtex, then latex twice (to resolve references)
1017
%to create the ~.bbl file. Insert that ~.bbl file into
1018
%the .tex source file and comment out
1019
%the command \texttt{{\char'134}thebibliography}.
1020
%% This next section command marks the start of
1021
%% Appendix B, and does not continue the present hierarchy
1022
%\section{More Help for the Hardy}
1023
%The sig-alternate.cls file itself is chock-full of succinct
1024