% This is "sig-alternate.tex" V2.1 April 2013
% This file should be compiled with V2.5 of "sig-alternate.cls" May 2012
%
% This example file demonstrates the use of the 'sig-alternate.cls'
% V2.5 LaTeX2e document class file. It is for those submitting
% articles to ACM Conference Proceedings WHO DO NOT WISH TO
% STRICTLY ADHERE TO THE SIGS (PUBS-BOARD-ENDORSED) STYLE.
% The 'sig-alternate.cls' file will produce a similar-looking,
% albeit, 'tighter' paper resulting in, invariably, fewer pages.
%
% ----------------------------------------------------------------------------------------------------------------
% This .tex file (and associated .cls V2.5) produces:
% 1) The Permission Statement
% 2) The Conference (location) Info information
% 3) The Copyright Line with ACM data
% 4) NO page numbers
%
% as against the acm_proc_article-sp.cls file which
% DOES NOT produce 1) thru' 3) above.
%
% Using 'sig-alternate.cls' you have control, however, from within
% the source .tex file, over both the CopyrightYear
% (defaulted to 200X) and the ACM Copyright Data
% (defaulted to X-XXXXX-XX-X/XX/XX).
% e.g.
% \CopyrightYear{2007} will cause 2007 to appear in the copyright line.
% \crdata{0-12345-67-8/90/12} will cause 0-12345-67-8/90/12 to appear in the copyright line.
%
% ---------------------------------------------------------------------------------------------------------------
% This .tex source is an example which *does* use
% the .bib file (from which the .bbl file % is produced).
% REMEMBER HOWEVER: After having produced the .bbl file,
% and prior to final submission, you *NEED* to 'insert'
% your .bbl file into your source .tex file so as to provide
% ONE 'self-contained' source file.
%
% ================= IF YOU HAVE QUESTIONS =======================
% Questions regarding the SIGS styles, SIGS policies and
% procedures, Conferences etc. should be sent to
% Adrienne Griscti (griscti@acm.org)
%
% Technical questions _only_ to
% Gerald Murray (murray@hq.acm.org)
% ===============================================================
%
% For tracking purposes - this is V2.0 - May 2012
\documentclass[conference]{IEEEtran}

\usepackage{listings} % Include the listings-package
\usepackage{color}
\usepackage{balance}
\usepackage{graphicx}
\usepackage{url}
\usepackage{tabularx,booktabs}
\usepackage{multirow}
\usepackage[normalem]{ulem}
\useunder{\uline}{\ul}{}

\definecolor{darkgreen}{rgb}{0,0.5,0}
\definecolor{mygreen}{rgb}{0,0.6,0}
\definecolor{mygray}{rgb}{0.5,0.5,0.5}
\definecolor{mymauve}{rgb}{0.58,0,0.82}
\lstset{ %
backgroundcolor=\color{white},   % choose the background color; you must add \usepackage{color} or \usepackage{xcolor}
basicstyle=\ttfamily\scriptsize, % the size of the fonts that are used for the code
breakatwhitespace=false,         % sets if automatic breaks should only happen at whitespace
breaklines=true,                 % sets automatic line breaking
captionpos=b,                    % sets the caption-position to bottom
commentstyle=\color{mygreen},    % comment style
deletekeywords={...},            % if you want to delete keywords from the given language
escapeinside={\%*}{*)},          % if you want to add LaTeX within your code
extendedchars=true,              % lets you use non-ASCII characters; for 8-bits encodings only, does not work with UTF-8
frame=single,                    % adds a frame around the code
keepspaces=true,                 % keeps spaces in text, useful for keeping indentation of code (possibly needs columns=flexible)
keywordstyle=\color{blue},       % keyword style
% language=C,                    % the language of the code
morecomment=[l]{--},
morekeywords={property,set,is,type, constant, enumeration, end, applies, to, inherit, of, *,...}, % if you want to add more keywords to the set
numbers=left,                    % where to put the line-numbers; possible values are (none, left, right)
numbersep=5pt,                   % how far the line-numbers are from the code
numberstyle=\tiny\color{mygray}, % the style that is used for the line-numbers
rulecolor=\color{black},         % if not set, the frame-color may be changed on line-breaks within not-black text (e.g. comments (green here))
showspaces=false,                % show spaces everywhere adding particular underscores; it overrides 'showstringspaces'
showstringspaces=false,          % underline spaces within strings only
showtabs=false,                  % show tabs within strings adding particular underscores
stepnumber=1,                    % the step between two line-numbers. If it's 1, each line will be numbered
stringstyle=\color{mymauve},     % string literal style
tabsize=2,                       % sets default tabsize to 2 spaces
title=\lstname                   % show the filename of files included with \lstinputlisting; also try caption instead of title
}

\providecommand{\keywords}[1]{\textbf{\textit{Index terms---}} #1}

\ifCLASSINFOpdf
% \usepackage[pdftex]{graphicx}
% declare the path(s) where your graphic files are
% \graphicspath{{../pdf/}{../jpeg/}}
% and their extensions so you won't have to specify these with
% every instance of \includegraphics
% \DeclareGraphicsExtensions{.pdf,.jpeg,.png}
\else
% or other class option (dvipsone, dvipdf, if not using dvips). graphicx
% will default to the driver specified in the system graphics.cfg if no
% driver is specified.
% \usepackage[dvips]{graphicx}
% declare the path(s) where your graphic files are
% \graphicspath{{../eps/}}
% and their extensions so you won't have to specify these with
% every instance of \includegraphics
% \DeclareGraphicsExtensions{.eps}
\fi
\begin{document}
%
% paper title
% can use linebreaks \\ within to get better formatting as desired
\title{A Trace-Based Study of SMB Network File System Workloads in an Academic Enterprise}

%\author{\IEEEauthorblockN{Paul Wortman and John Chandy}
%\IEEEauthorblockA{Department of Electrical and Computer Engineering\\
%University of Connecticut, USA\\
%(paul.wortman, john.chandy)@uconn.edu
%}}

% make the title area
\maketitle

\begin{abstract}
Storage system traces are important for examining real-world applications, studying potential bottlenecks, as well as driving benchmarks in the evaluation of new system designs.
While file system traces have been well-studied in earlier work, it has been some time since the last examination of the SMB network file system.
The purpose of this work is to continue previous SMB studies to better understand the use of the protocol in a real-world production system in use at a major research university. %\textcolor{red}{the University of Connecticut}.
The main contribution of our work is the exploration of I/O behavior in modern file system workloads as well as new examinations of the inter-arrival times and run times for I/O events.
We further investigate if the recent standard models for traffic remain accurate.
Our findings reveal interesting data relating to the number of read and write events. We notice that the number of read and write events is significantly smaller than the number of create events, and the average number of bytes exchanged per I/O is much smaller than what has been seen in previous studies.
%the average of bytes transferred over the wire is much smaller than what has been seen in previous studies.
Furthermore, we find an increase in the use of metadata for overall network communication that can be taken advantage of through the use of smart storage devices.
\keywords{Server Message Block,
Network Benchmark, Storage Systems, Distributed I/O, Network Communication Analysis.}
\end{abstract}
\section{Introduction}
%Mention:
%\begin{itemize}
% \item Why is it important to re-examine the SMB protocol?
% \item Why does examination of network use matter?
% \item Need to ensure hash of data and not saving any of the original traffic packets.
%\end{itemize}
Over the last twenty years, data storage provisioning has been centralized through the
use of network file systems. The architectures of these storage systems can vary from
storage area networks (SAN), network attached storage (NAS), clustered file systems,
hybrid storage, amongst others. However, the front-end client-facing network file
system protocol in most enterprise IT settings tends to be, for the most part, solely
SMB (Server Message Block) because of the preponderance of MS Windows clients.
While there are other network file systems such as Network File System (NFS) and
clustered file systems such as Ceph, PanFS, and OrangeFS, they tend to be used less
extensively in most non-research networks.

In spite of the prevalence of SMB usage within most enterprise networks, there has
been very little analysis of SMB workloads in prior academic research. The last major study
of SMB was more than a decade ago~\cite{leung2008measurement}, and the nature of storage
usage has changed dramatically over the last decade.
It is always important to revisit commonly used protocols to examine their use in comparison to the expected use case(s). This is doubly so for network communications because the nuances of networked data exchange can greatly influence the effectiveness and efficiency of a chosen protocol.
Since an SMB-based trace study has not been undertaken
recently, we took a look at its current implementation and use in a large university network.
%Due to the sensitivity of the captured information, we ensure that all sensitive information is hashed and that the original network captures are not saved.

Our study is based on network packet traces collected on a major research university's
%\textcolor{red}{the University of Connecticut}'s
centralized storage facility over a period of three weeks in May 2019. This trace-driven analysis can help in the design of future storage products as well as providing data for future performance benchmarks.
%Benchmarks are important for the purpose of developing technologies as well as taking accurate metrics. The reasoning behind this tracing capture work is to eventually better develop accurate benchmarks for network protocol evaluation.
Benchmarks allow for the stress testing of various aspects of a system (e.g. network, single system). Aggregate data analysis collected from traces can lead to the development of synthetic benchmarks. Traces can also expose system patterns that can be reflected in synthetic benchmarks. Finally, the traces themselves can drive system simulations that can be used to evaluate prospective storage architectures.

%\begin{itemize}
% \item \textbf{Why?:} Benchmarks allow for the stress testing of different/all aspects of a system (e.g. network, single system).
% \item \textbf{How:} There are three ``steps'' to creating a benchmark.
% \begin{enumerate}
% \item Take a trace of an existing system
% \begin{itemize}
% \item This is important because this information is how one is able to compare the expected actions of a system (theory) against the traced actions (practice) of the system. Leads to the development of later synthetic benchmarks.
% \end{itemize}
% \item Determine which aspects of the trace of said system (in an educated arbitrary way) are most representative of ``what occurred'' during the tracing of the system. Leads to discovery of habits/patterns of the system; which is later used for synthetic benchmark.
% \item Use discovered information to produce benchmark
% \begin{itemize}
% \item Done by either running a repeat of the trace of synthetic benchmark created using trends from trace.
% \end{itemize}
% \end{enumerate}
%\end{itemize}

We created a new tracing system to collect data from the university
%\textcolor{red}{UConn}
storage network system. The tracing system was built around the high-speed PF\_RING packet capture system and required the use of proper hardware and software to handle incoming data. %\textcolor{blue}{however interaction with later third-party code did require re-design for processing of the information}
We also created a new trace capture format based on the DataSeries structured data format developed by HP~\cite{DataSeries}.
% PF\_RING section
%The addition of PF\_RING lends to the tracing system by minimizing the copying of packets which, in turn, allows for more accurate timestamping of incoming traffic packets being captured~\cite{Orosz2013,skopko2012loss,pfringWebsite,PFRINGMan}.
PF\_RING acts as a kernel module that aids in minimizing packet loss/timestamping issues by not passing packets through the kernel data structures~\cite{PFRINGMan}.
%The other reason PF\_RING is instrumental is that it functions with the 10Gb/s hardware that was installed into the Trace1 server; allowing for full throughput from the network tap on the UITS system. \\
% DataSeries + Code section
DataSeries was modified to filter specific SMB protocol fields along with the writing of analysis tools to parse and dissect the captured packets. Specific fields were chosen to be the interesting fields kept for analysis.
%It should be noted that this was done originally arbitrarily and changes/additions have been made as the value of certain fields were determined to be worth examining; e.g. multiple runs were required to refine the captured data for later analysis.
The DataSeries data format allowed us to create data analysis code that focuses on I/O events and ID tracking (TID/UID). The future vision for this information is to combine ID tracking with the OpLock information in order to track resource sharing of the different clients on the network, as well as using IP information to recreate communication in a larger network trace to establish a better benchmark.

%Focus should be about analysis and new traces
The contributions of this work are the new traces of SMB traffic over a large university network as well as new analysis of this traffic. Our new examination of the captured data reveals that despite the streamlining of the CIFS/SMB protocol to be less ``chatty'', the majority of SMB communication is still metadata based I/O rather than actual data I/O. We found that read operations occur in greater numbers and cause a larger overall number of bytes to pass over the network. Additionally, the average number of bytes transferred for each write I/O is smaller than that of the average read operation. We also find that the current standard for modeling network I/O holds for the majority of operations, while a more representative model needs to be developed for reads.
%\textcolor{red}{Add information about releasing the code?}

\section{Related Work}
\begin{table*}[h]
\centering
\begin{tabular}{|r|c|c|c|c|c|}
\hline
Study & Date of Traces & FS/Protocol & Network FS & Trace Approach & Workload \\ \hline
Ousterhout, \textit{et al.}~\cite{ousterhout1985trace} & 1985 & BSD & & Dynamic & Engineering \\ \hline
Ramakrishnan, \textit{et al.}~\cite{ramakrishnan1992analysis} & 1988-89 & VAX/VMS & x & Dynamic & Engineering, HPC, Corporate \\ \hline
Baker, \textit{et al.}~\cite{baker1991measurements} & 1991 & Sprite & x & Dynamic & Engineering \\ \hline
Gribble, \textit{et al.}~\cite{gribble1996self} & 1991-97 & Sprite, NFS, VxFS & x & Both & Engineering, Backup \\ \hline
Douceur and Bolosky~\cite{douceur1999large} & 1998 & FAT, FAT32, NTFS & & Snapshots & Engineering \\ \hline
Vogels~\cite{vogels1999file} & 1998 & FAT, NTFS & & Both & Engineering, HPC \\ \hline
Zhou and Smith~\cite{zhou1999analysis} & 1999 & VFAT & & Dynamic & PC \\ \hline
Roselli, \textit{et al.}~\cite{roselli2000comparison} & 1997-00 & VxFS, NTFS & & Dynamic & Engineering, Server \\ \hline
Malkani, \textit{et al.}~\cite{malkani2003passive} & 2001 & NFS & x & Dynamic & Engineering, Email \\ \hline
Agrawal, \textit{et al.}~\cite{agrawal2007five} & 2000-2004 & FAT, FAT32, NTFS & & Snapshots & Engineering \\ \hline
Leung, \textit{et al.}~\cite{leung2008measurement} & 2007 & CIFS & x & Dynamic & Corporate, Engineering \\ \hline
%Traeger, \textit{et al.}~\cite{traeger2008nine} & 2008 & FUSE & x & Snapshots & Backup \\ \hline
Vrable, \textit{et al.}~\cite{vrable2009cumulus} & 2009 & FUSE & x & Snapshots & Backup \\ \hline
Benson, \textit{et al.}~\cite{benson2010network} & 2010 & AFS, MapReduce, NCP, SMB & x & Dynamic & Academic, Corporate \\ \hline
Chen, \textit{et al.}~\cite{chen2012interactive} & 2012 & MapReduce & x & Dynamic & Corporate \\ \hline
This paper & 2020 & SMB & x & Dynamic & Academic, Engineering, Backup \\ \hline
\end{tabular}
\caption{Summary of major file system studies over the past decades. For each study the table shows the dates of the trace data, the file system or protocol studied, whether it involved network file systems, the trace methodology used, and the workloads studied. Dynamic trace studies are those that involve traces of live requests. Snapshot studies involve snapshots of file system contents.}
\label{tbl:studySummary}
\vspace{-2em}
\end{table*}
\label{Previous Advances Due to Testing}
%In this section we discuss previous studies examining traces and testing that has advanced benchmark development.
We summarize major works in trace study in Table~\ref{tbl:studySummary}.
%In addition we examine issues that occur with traces and the assumptions in their study.
Tracing collection and analysis from previous studies have provided important insights and lessons such as observations of read/write event changes, overhead concerns originating in system implementation, bottlenecks in communication, and other revelations found in the traces.
Previous tracing work has shown that one of the largest and broadest hurdles to tackle is that traces (and benchmarks) must be tailored to the system being tested. There are always some generalizations taken into account but these generalizations can also be a major source of error (e.g. timing, accuracy, resource usage)~\cite{vogels1999file,malkani2003passive,seltzer2003nfs,anderson2004buttress,Orosz2013,dabir2007bottleneck,skopko2012loss,traeger2008nine,ruemmler1992unix}.
To produce a benchmark with high fidelity one needs to understand not only the technology being used but how it is being implemented within the system~\cite{roselli2000comparison,traeger2008nine,ruemmler1992unix}. All these aspects lend to the behavior of the system; from timing and resource elements to how the managing software governs actions~\cite{douceur1999large,malkani2003passive,seltzer2003nfs}. Furthermore, in pursuing this work one may find unexpected results and learn new things through examination~\cite{leung2008measurement,roselli2000comparison,seltzer2003nfs}.
These studies are required in order to evaluate the development of technologies and methodologies along with furthering knowledge of different system aspects and capabilities. As has been pointed out by past work, the design of systems is usually guided by an understanding of the file system workloads and user behavior~\cite{leung2008measurement}.
%It is for that reason that new studies are constantly performed by the science community, from large scale studies to individual protocol studies~\cite{leung2008measurement,vogels1999file,roselli2000comparison,seltzer2003nfs,anderson2004buttress}. Even within these studies, the information gleaned is only as meaningful as the considerations of how the data is handled.

%The work done by
Leung et al.~\cite{leung2008measurement} found that
%observations related to the infrequency of files to be shared by more than one client.
over 67\% of files were never opened by more than one client
%Work by Leung \textit{et al.} led to a series of observations, from the fact that files are rarely re-opened to finding
and that read-write access patterns are more frequent~\cite{leung2008measurement}.
%If files were shared it was rarely concurrently and usually as read-only; where 5\% of files were opened by multiple clients concurrently and 90\% of the file sharing was read only.
%Concerns of the accuracy achieved of the trace data was due to using standard system calls as well as errors in issuing I/Os leading to substantial I/O statistical errors.
% Anderson Paper
%The 2004 paper by
Anderson et al.~\cite{anderson2004buttress} found that a
%has the following observations. A
source of decreased precision came from the kernel overhead for providing timestamp resolution. This would introduce substantial errors in the observed system metrics due to the use of inaccurate tools when benchmarking I/O systems. These errors in perceived I/O response times can range from +350\% to -15\%.
%I/O benchmarking widespread practice in storage industry and serves as basis for purchasing decisions, performance tuning studies and marketing campaigns.
Issues of inaccuracies in scheduling I/O can result in as much as a factor of 3.5 difference in measured response time and a factor of 26 in measured queue sizes. These inaccuracies pose too much of an issue to ignore.

Orosz and Skopko examined the effect of the kernel on packet loss and
%in their 2013 paper~\cite{Orosz2013}. Their work
showed that when taking network measurements the precision of the timestamping of packets is a more important criterion than low clock offset, especially when measuring packet inter-arrival times and round-trip delays at a single point of the network. One solution for network capture is the tool Dumpcap. However, the concern with Dumpcap is that it is a single threaded application and was suspected to be unable to handle new arriving packets due to the small size of the kernel buffer. Work by
Dabir and Matrawy%, in 2008
~\cite{dabir2007bottleneck} attempted to overcome this limitation by using two semaphores to buffer incoming strings and improve the writing of packet information to disk.
%Narayan and Chandy examined the concerns of distributed I/O and the different models of parallel application I/O.
%There are five major models of parallel application I/O. (1) Single output file shared by multiple nodes. (2) Large sequential reads by a single node at the beginning of computation and large sequential writes by a single node at the end of computation. (3) Checkpointing of states. (4) Metadata and read intensive (e.g. small data I/O and frequent directory lookups for reads).
%Due to the striping of files across multiple nodes, this can cause any read or write to access all the nodes; which does not decrease the inter-arrival times (IATs) seen. As the number of I/O operations increases and the number of nodes increases, the IAT times decreased.
%Observations from
%a 2012 paper by
Skopk\'o~\cite{skopko2012loss} examined the concerns of software based capture solutions and observed that
%. The main observation was
software solutions relied heavily on OS packet processing mechanisms. Furthermore, depending on the mode of operation (e.g. interrupt or polling), the timestamping of packets would change.
As seen in previous trace work~\cite{leung2008measurement,roselli2000comparison,seltzer2003nfs}, the general perceptions of how computer systems are being used versus their initial purpose have allowed for great strides in eliminating actual bottlenecks rather than spending unnecessary time working on imagined bottlenecks. Without illumination of these underlying actions (e.g. read-write ratios, file death rates, file access rates) these issues cannot be readily tackled.

\section{Background}
%\subsection{Server Message Block}
The Server Message Block (SMB) is an application-layer network protocol mainly used for providing shared access to files, printers, and serial ports, for miscellaneous communications between nodes on the network, as well as for providing an authenticated inter-process communication mechanism.
%The majority of usage for the SMB protocol involves Microsoft Windows. Almost all implementations of SMB servers use NT Domain authentication to validate user-access to resources
The SMB 1.0 protocol~\cite{SMB1Spec} has been found to have a significant impact on performance due to latency issues. Monitoring revealed a high degree of ``chattiness'' and disregard of network latency between hosts. Solutions to this problem were included in the updated SMB 2.0 protocol which decreases ``chattiness'' by reducing commands and sub-commands from over a hundred to nineteen~\cite{SMB2Spec}. Additional changes, most significantly increased security, were implemented in the SMB 3.0 protocol (previously named SMB 2.2). % XXX citations for SMB specs for different versions?
%\textcolor{red}{\textbf{Add information about SMB 2.X/3?}}

\begin{figure*}[ht!]
\includegraphics[width=\textwidth]{./images/packetcapturetopology.png}
\caption{Visualization of Packet Capturing System}
\label{fig:captureTopology}
\end{figure*}

The rough order of communication for SMB session file interaction contains five steps. First is a negotiation where a Microsoft SMB Protocol dialect is determined. Next, a session is established to determine the share-level security. After this, the Tree ID (TID) is determined for the share to be connected to as well as a file ID (FID) for a file requested by the client. From this establishment, I/O operations are performed using the FID given in the previous step. %\textcolor{green}{The SMB packet header is shown in Figure~\ref{fig:smbPacket}.}

% Information relating to the capturing of SMB information
The only data that needs to be tracked from the SMB traces are the UID (User ID) and TID for each session. The SMB commands also include a MID (Multiplex ID) value that is used for tracking individual packets in each established session, and a PID (Process ID) that tracks the process running the command or series of commands on a host.
For the purposes of our tracing, we do not track the MID or PID information.
%
Some nuances of the SMB protocol I/O to note are that SMB/SMB2 write requests are the actions that push bytes over the wire, while for SMB/SMB2 read operations it is the response packets.


%\begin{itemize}
% \item SMB/SMB2 write request is the command that pushes bytes over the wire. \textbf{Note:} the response packet only confirms their arrival and use (e.g. writing).
% \item SMB/SMB2 read response is the command that pushes bytes over the wire. \textbf{Note:} The request packet only asks for the data.
%\end{itemize}
% Make sure to detail here how exactly IAT/RT are each calculated
%\textcolor{red}{Add writing about the type of packets used by SMB. Include information about the response time of R/W/C/General (to introduce them formally; not sure what this means.... Also can bring up the relation between close and other requests.}
%\textcolor{blue}{It is worth noting that for the SMB2 protocol, the close request packet is used by clients to close instances of file that \textcolor{green}{were opened} with a previous create request packet.}

%\begin{figure}
% \includegraphics[width=0.5\textwidth]{./images/smbPacket.jpg}
% \caption{SMB Packet \textcolor{green}{Header Format}}
% \label{fig:smbPacket}
%\end{figure}

%\subsection{Issues with Tracing}
%\label{Issues with Tracing}
%There are three general approaches to creating a benchmark based on a trade-off between experimental complexity and resemblance to the original application. (1) Connect the system to a production test environment, run the application, and measure the application metrics. (2) Collect traces from running the application and replay them (after possible modification) back on the test I/O system. (3) Generate a synthetic workload and measure the system performance.
%
%The majority of benchmarks attempt to represent a known system and structure on which some ``original'' design/system was tested. While this is all well and good, there are many issues with this sort of approach; temporal and spatial scaling concerns, timestamping and buffer copying, as well as driver operation for capturing packets~\cite{Orosz2013,dabir2007bottleneck,skopko2012loss}. Each of these aspects contribute to the initial problems with dissection and analysis of the captured information. For example, inaccuracies in scheduling I/Os may result in as much as a factor of 3.5 differences in measured response time and factor of 26 in measured queue sizes; differences that are too large to ignore~\cite{anderson2004buttress}.
%Dealing with timing accuracy and high throughput involves three challenges. (1) Designing for dealing with peak performance requirements. (2) Coping with OS timing inaccuracies. (3) Working around unpredictable OS behavior; e.g. mechanisms to keep time and issue I/Os or performance effects due to interrupts.
%
%Temporal scaling refers to the need to account for the nuances of timing with respect to the run time of commands; consisting of computation, communication and service. A temporally scalable benchmarking system would take these subtleties into account when expanding its operation across multiple machines in a network. While these temporal issues have been tackled for a single processor (and even somewhat for cases of multi-processor), these same timing issues are not properly handled when dealing with inter-network communication. Inaccuracies in packet timestamping can be caused due to overhead in generic kernel-time based solutions, as well as use of the kernel data structures~\cite{PFRINGMan,Orosz2013}.

%Spatial scaling refers to the need to account for the nuances of expanding a benchmark to incorporate a number of machines over a network. A system that properly incorporates spatial scaling is one that would be able to incorporate communication (even in varying intensities) between all the machines on a system, thus stress testing all communicative actions and aspects (e.g. resource locks, queueing) on the network.

\section{Packet Capturing System}
331
%In this section, we describe the packet capturing system as well as decisions made that influence its capabilities. We illustrate the existing university network filesystem as well as our methods for ensuring high-speed packet capture. Then, we discuss the analysis code we developed for examining the captured data.
332
% and on the python dissection code we wrote for performing traffic analysis.
333
Jan 16, 2020
334
335
\subsection{University Storage System Overview}
336
We collected traces from the university
Apr 22, 2020
337
%\textcolor{red}{the University of Connecticut University Information Technology Services (UITS)}
338
centralized storage server%The \textcolor{red}{UITS system}
339
, which consists of five Microsoft file server cluster nodes. These blade servers are used to host SMB file shares for various departments at
340
the university
Apr 22, 2020
341
%\textcolor{red}{UConn}
342
as well as personal drive share space for faculty, staff and students, along with at least one small group of users. Each server is capable of handling 1~Gb/s of traffic in each direction (e.g. outbound and inbound traffic). Altogether, the five-blade server system can in theory handle 5~Gb/s of data traffic in each direction.
%Some of these blade servers have local storage but the majority do not have any.
The blade servers serve as SMB heads, but the actual storage is served by SAN storage nodes that sit behind them. This system does not currently implement load balancing. Instead, the servers are set up to spread the load with a static distribution across four of the active cluster nodes while the passive fifth node takes over in case any of the other nodes goes down.% (e.g. become inoperable or crash).
The actual tracing was performed with a tracing server connected to a switch outfitted with a packet duplicating element as shown in the topology diagram in Figure~\ref{fig:captureTopology}. A 10~Gbps network tap was installed in the file server switch, allowing our storage server to obtain a copy of all network traffic going to the five file servers. The reason for using 10~Gbps hardware is to help ensure that the system is able to capture information on the network at peak theoretical throughput.
347
348
\subsection{High-speed Packet Capture}
\label{Capture}
%The packet capturing aspect of the tracing system is fairly straight forward.
%On top of the previously mentioned alterations to the system (e.g. PF\_RING), the capture of packets is done through the use of \textit{tshark}, \textit{pcap2ds}, and \textit{inotify} programs.
%The broad strokes are that incoming SMB/CIFS information comes from the university's network. All packet and transaction information is passed through a duplicating switch that then allows for the tracing system to capture these packet transactions over a 10 Gb port. These packets are
%passed along to the \textit{tshark} packet collection program which records these packets into a cyclical capturing ring. A watchdog program (\textit{inotify}) watches the directory where all of these packet-capture (pcap) files are being stored. As a new pcap file is completed \textit{inotify} passes the file to \textit{pcap2ds} along with what protocol is being examined (i.e. SMB). The \textit{pcap2ds} program reads through the given pcap files,
In order to maximize our faithful capture of the constant rate of traffic, we implement on the tracing server an ntop~\cite{ntopWebsite} solution called PF\_RING~\cite{pfringWebsite} to dramatically improve the storage server's packet capture speed.
%A license was obtained for scholastic use of PF\_RING. PF\_RING implements a ring buffer to provide fast and efficient packet capturing. Having implemented PF\_RING, the next step was to
We had to tune an implementation of \texttt{tshark} (Wireshark's terminal pcap implementation) to maximize the packet capture rate.
%and dissection into the DataSeries format~\cite{dataseriesGit}.
%The assumption being made is that PF\_RING tackles and takes care of the concerns of packets loss due to buffer size, storage, and writing. \textit{tshark} need only read in those packets and generate the necessary DataSeries (ds) files.
\texttt{tshark} outputs \texttt{.pcap} files which capture all of the data present in packets on the network. We configure \texttt{tshark} so that it only captures SMB packets. Furthermore, to optimize this step, a capture ring buffer flag is used to minimize the amount of space used to write \texttt{.pcap} files, while optimizing the amount of time to
%\textit{pcap2ds} can
filter data from the \texttt{.pcap} files.
The ring buffer was configured so that each captured file was 64,000~kB in size.
% This causes tshark to switch to the next file after it reaches a determined size.
%To simplify this aspect of the capturing process, the entirety of the capturing, dissection, and permanent storage was all automated through watch-dog scripts.
367
The \texttt{.pcap} files from \texttt{tshark} do not lend themselves to easy data analysis, so we translate these files into the DataSeries~\cite{DataSeries} format, an XML-based structured data format designed to be self-descriptive, storage and access efficient, and highly flexible.
%The system for taking captured \texttt{.pcap} files and writing them into the DataSeries format (i.e. \texttt{.ds}) does so by first creating a structure (based on a pre-written determination of the data desired to capture). Once the code builds this structure, it then reads through the capture traffic packets while dissecting and filling in the prepared structure with the desired information and format.
Due to the fundamental nature of this work, there is no need to track every piece of information that is exchanged, only that information which illuminates the behavior of the clients and servers that interact over the network (i.e. I/O transactions). It should also be noted that all sensitive information being captured by the tracing system is hashed to protect the users whose information is examined by the tracing system. Furthermore, the DataSeries file retains only the first 512 bytes of the SMB packet---enough to capture the SMB header information that contains the I/O information we seek, while the body of the SMB traffic is not retained in order to better ensure the privacy of the university's network communications. The reasoning for this limit was to allow for capture of longer SMB AndX message chains due to the negotiated \textit{MaxBufferSize}. It is worth noting that in the case of larger SMB headers some information is lost; however, this is a trade-off by the university to provide, on average, the correctly sized SMB header, though it does lead to scenarios where some information may be captured incompletely. This scenario only occurs in the cases of large AndX Chains in the SMB protocol, since the SMB header for SMB~2 is fixed at 72 bytes. In those scenarios the AndX messages specify only a single SMB header with the rest of the AndX Chain attached in a series of block pairs.
370
371
\subsection{DataSeries Analysis}
Building upon existing code for the interpretation and dissection of the captured \texttt{.ds} files, we developed C/C++ code for examining the captured traffic information. From this analysis, we are able to capture read, write, create and general I/O information at both a global scale and individual tracking ID (UID/TID) level. In addition, read and write buffer size information is tracked, as well as the inter-arrival and response times. Also included in this data is oplock information and IP addresses. The main contribution of this step is to aggregate the observed information for later interpretation of the results.
This step also creates an easily digestible output that can be used to re-create all tuple information for SMB/SMB2 sessions that are witnessed over the entire time period.
Sessions are any communication where a valid UID and TID are used.
376
377
%\textcolor{red}{Add information about if the code will be publically shared?}
378
379
%\subsection{Python Dissection}
380
%The final step of our SMB/SMB2 traffic analysis system is the dissection of the \texttt{AnalysisModule} output using the pandas data analysis library~\cite{pandasPythonWebsite}. The pandas library is a python implementation similar to R. In this section of the analysis structure, the generated text file is tokenized and placed into specific DataFrames representing the data seen for each 15 minute period. The python code is used for the analysis and further dissection of the data. This is where the cumulative distribution frequency and graphing of collected data is performed. Basic analysis and aggregation is also performed in this part of the code. This analysis includes the summation of individual session I/O (e.g. reads, writes, creates) as well as the collection of inter arrival time data and response time data.
381
382
\section{Data Analysis}
383
\label{sec:data-analysis}
384
385
\begin{table}[]
386
\centering
387
\begin{tabular}{|l|l|}
388
\hline
389
% & Academic Engineering \\ \hline
390
%Maximum Tuples in 15-min Window & 36 \\ %\hline
391
%Total Tuples Seen & 2721 \\ \hline
392
%\textcolor{red}{Maximum Sessions in 15-min Window} & 35 \\ %\hline
393
%Maximum Non-Session in 15-min Window & 2 \\ \hline
394
Total Days & 21 \\ %\hline
395
Total Sessions & 2,413,589 \\ %\hline
396
%Total Non-Sessions & 279006484 \\ \hline
397
Number of SMB Operations & 281,419,686 (100\%)\\ %\hline
398
Number of General SMB Operations & 210,705,867 (74.87\%) \\ %\hline
399
Number of Creates & 54,486,043 (19.36\%) \\ %\hline
400
Number of Read I/Os & 8,355,557 (2.97\%) \\ %\hline
401
Number of Write I/Os & 7,872,219 (2.80\%) \\ %\hline
402
R:W I/O Ratio & 1.06 \\ \hline
403
Total Data Read (GB) & 0.97 \\ %\hline
404
Total Data Written (GB) & 0.6 \\ %\hline
405
Average Read Size (B) & 144 \\ %\hline
406
Average Write Size (B) & 63 \\ \hline
407
%Percentage of Read Bytes of Total Data & 99.4\% \\ %\hline
408
%Percentage of Written Bytes of Total Data & 0.6\% \\ %\hline
409
%Total R:W Byte Ratio & 166.446693744549 \\ %\hline
410
%Average R:W Byte Ratio & 0.253996031053668 \\ \hline
411
\end{tabular}
\caption{\label{tbl:TraceSummaryTotal}Summary of Trace I/O Statistics for the period of April 30th, 2019 to May 20th, 2019}
\vspace{-2em}
\end{table}
415
% NOTE: Not sure but this reference keeps referencing the WRONG table
416
417
Table~\ref{tbl:TraceSummaryTotal}
shows a summary of the SMB traffic captured, statistics of the I/O operations, and read/write data exchange observed for the network filesystem. This information is further detailed in Table~\ref{tbl:SMBCommands}, which illustrates that the majority of I/O operations are general (74.87\%). As shown in %the bottom part of
Table~\ref{tbl:SMBCommands2}, general I/O includes metadata commands such as connect, close, query info, etc.
Jan 16, 2020
421
Our examination of the collected network filesystem data revealed interesting patterns for the current use of CIFS/SMB in a large engineering academic setting. The first is that there is a major shift away from read and write operations towards more metadata-based ones. This matches the last CIFS observations made by Leung et~al.\ that files were being generated and accessed infrequently. The change in operations is due to a movement of use activity from reading and writing data to simply checking file and directory metadata. However, since the earlier study, SMB has transitioned to the SMB2 protocol, which was supposed to be less ``chatty'' and thus we would expect fewer general SMB operations. Table~\ref{tbl:SMBCommands} shows a breakdown of SMB and SMB2 usage over the time period of May. From this table, one can see that the SMB2 protocol makes up $99.14$\% of total network operations compared to just $0.86$\% for SMB, indicating that most clients have upgraded to SMB2. However, $74.66$\% of SMB2 I/O are still general operations. Contrary to the purpose of implementing the SMB2 protocol, there is still a large amount of general I/O.
422
%While CIFS/SMB protocol has less metadata operations, this is due to a depreciation of the SMB protocol commands, therefore we would expect to see less total operations (e.g. $0.04$\% of total operations).
423
%The infrequency of file activity is further strengthened by our finding that within a week long window of time there are no Read or Write inter arrival times that can be calculated.
424
%\textcolor{red}{XXX we are going to get questioned on this. its not likely that there are no IATs for reads and writes}
425
%General operations happen at very high frequency with inter arrival times that were found to be relatively short (1317$\mu$s on average), as shown in Table~\ref{tbl:PercentageTraceSummary}.
Apr 22, 2020
427
Taking a deeper look at the SMB2 operations, shown in %the bottom half of
Table~\ref{tbl:SMBCommands2}, we see that $9.06$\% of the general operations are negotiate commands. These are commands sent by the client to notify the server which dialects of the SMB2 protocol the client can understand. The three most common commands are close, tree connect, and query info.
The latter two relate to metadata information of shares and files accessed. However, the close operation corresponds to the create operations. Note that the create command is also used to open a file. Notice that the number of closes is greater than the total number of create operations by $9.35$\%. These extra close operations are most likely due to applications performing multiple closes that do not need to be performed.
430
431
\begin{table}
432
\centering
433
\begin{tabular}{|l|c|c|c|}
434
\hline
435
I/O Operation & SMB & SMB2 & Both \\ \hline
436
General Operations & 2,418,980 & 208,286,887 & 210,705,867 \\
437
General \% & 99.91\% & 74.66\% & 74.87\% \\ %\hline
438
Create Operations & 0 & 54,486,043 & 54,486,043 \\
439
Create \% & 0.00\% & 19.53\% & 19.36\% \\
440
Read Operations & 1,931 & 8,353,626 & 8,355,557 \\
441
Read \% & 0.08\% & 2.99\%& 2.97\%\\
442
Write Operations & 303 & 7,871,916 & 7,872,219 \\
443
Write \% & 0.01\% & 2.82\% & 2.80\% \\ \hline
444
Combine Protocol Operations & 2,421,214 & 278,998,472 & 281,419,686 \\
445
Combined Protocols \% & 0.86\% & 99.14\% & 100\% \\ \hline
Apr 22, 2020
446
\end{tabular}
447
\caption{\label{tbl:SMBCommands}Percentage of SMB and SMB2 Protocol Commands from April 30th, 2019 to May 20th, 2019}
448
\end{table}
449
450
\begin{table}[]
451
\centering
452
\begin{tabular}{|l|c|c|c|}
453
\hline
454
SMB2 General Operation & \multicolumn{2}{|c|}{Occurrences} & Percentage of Total \\ \hline
455
Close & \multicolumn{2}{|c|}{80,114,256} & 28.71\% \\
456
Tree Connect & \multicolumn{2}{|c|}{48,414,491} & 17.35\% \\
457
Query Info & \multicolumn{2}{|c|}{27,155,528} & 9.73\% \\
458
Negotiate & \multicolumn{2}{|c|}{25,276,447} & 9.06\% \\
459
Tree Disconnect & \multicolumn{2}{|c|}{9,773,361} & 3.5\% \\
460
IOCtl & \multicolumn{2}{|c|}{4,475,494} & 1.6\% \\
461
Set Info & \multicolumn{2}{|c|}{4,447,218} & 1.59\% \\
462
Query Directory & \multicolumn{2}{|c|}{3,443,491} & 1.23\% \\
463
Session Setup & \multicolumn{2}{|c|}{2,041,208} & 0.73\%\\
464
Lock & \multicolumn{2}{|c|}{1,389,250} & 0.5\% \\
465
Flush & \multicolumn{2}{|c|}{972,790} & 0.35\% \\
466
Change Notify & \multicolumn{2}{|c|}{612,850} & 0.22\% \\
467
Logoff & \multicolumn{2}{|c|}{143,592} & 0.05\% \\
468
Oplock Break & \multicolumn{2}{|c|}{22,397} & 0.008\% \\
469
Echo & \multicolumn{2}{|c|}{4,715} & 0.002\% \\
470
Cancel & \multicolumn{2}{|c|}{0} & 0.00\% \\
471
\hline
472
\end{tabular}
Apr 22, 2020
473
\caption{\label{tbl:SMBCommands2}Breakdown of General Operations for SMB2 from April 30th, 2019 to May 20th, 2019.}
474
\vspace{-2em}
475
\end{table}
476
477
\subsection{I/O Data Request Sizes}
478
%\textcolor{red}{Figures~\ref{fig:IO-All} and~\ref{fig:IO-R+W} show the amount of I/O in 15-minute periods during the week of March 12-18, 2017.
479
%The general I/O (GIO) value is representative of I/O that does not include read, write, or create actions. For the most part, these general I/O are mostly metadata operations. As one can see in Figure~\ref{fig:IO-All}, the general I/O dominates any of the read or write operations. Figure~\ref{fig:IO-R+W} is a magnification of the read and write I/O from Figure~\ref{fig:IO-All}. Here we see that the majority of I/O operations belong to reads. There are some spikes where more write I/O occur, but these events are in the minority. One should also notice that, as would be expected, the spikes of I/O activity occur around the center of the day (e.g. 8am to 8pm), and during the week (March 12 was a Sunday and March 18 was a Saturday).}
480
481
%\begin{figure}
482
% \includegraphics[width=0.5\textwidth]{./images/AIO.pdf}
483
% \caption{All I/O}
484
% \label{fig:IO-All}
485
%\end{figure}
486
%\begin{figure}
487
% \includegraphics[width=0.5\textwidth]{./images/RWIO-win.pdf}
488
% \caption{Read and Write I/O}
489
% \label{fig:IO-R+W}
490
%\end{figure}
Jan 16, 2020
491
Each SMB Read and Write command is associated with a data request size that indicates how many bytes are to be read or written as part of that command.
Figure~\ref{fig:SMB-Bytes-IO} %and~\ref{fig:PDF-Bytes-Write}
shows the probability density function (PDF) of the different sizes of bytes transferred for read and write I/O operations respectively. The most noticeable aspect of these graphs is that the majority of bytes transferred for read and write operations is around 64 bytes. It is worth noting that write I/Os also have a larger number of very small transfer amounts. This is unexpected in terms of the amount of data passed in a frame. Part of the reason is due to a large number of long term %calculations/
scripts that only require small but frequent updates, as we observed several
%. This assumption was later validated in part when examining the files transferred, as some were related to
running scripts creating a large volume of files. A more significant reason was that we noticed Microsoft Word would perform a large number of small reads at ever growing offsets. We interpreted this as follows: when a user is viewing a document over the network, Word loads the next few lines of text as the user scrolls down the document, causing ``loading times'' amid use. Finally, a large degree of small writes were observed to be related to application cookies or other such smaller data communications.
497
%This could also be attributed to simple reads relating to metadata\textcolor{red}{???}
498
Duncan
Apr 23, 2020
499
%\textcolor{blue}{Reviewing of the SMB and SMB2 leads to some confusion in understanding this behavior. According to the specification the default ``MaxBuffSize'' for reads and writes should be between 4,356 bytes and 16,644 bytes depending on the use of either a client version of server version of Windows; respectively. In the SMB2 protocol specification, specific version of Windows (e.g. Vista SP1, Server 2008, 7, Server 2008 R2, 8, Server 2012, 8.1, Server 2012 R2) disconnect if the ``MaxReadSize''/``MaxWriteSize'' value is less than 4096. However, further examination of the specification states that for SMB2 the read length and write length can be zero. Thus, this seems to conflict that the size has to be greater than 4096 but allows for it to also be zero. It is due to this protocol specification of allowing zero that supports the smaller read/write sizes seen in the captured traffic. The author's assumption here is that the university's configuration allows for smaller traffic to be exchanged without the disconnection for sizes smaller than 4096.}
501
%\begin{figure}
502
% \includegraphics[width=0.5\textwidth]{./images/aggAvgBytes.pdf}
503
% \caption{Average Bytes by I/O}
504
% \label{fig:Agg-AvgBytes}
505
%\end{figure}
506
%
507
%\begin{figure}
508
% \includegraphics[width=0.5\textwidth]{./images/bytesCompare.pdf}
509
% \caption{Total Bytes by I/O}
510
% \label{fig:bytesCompare}
511
%\end{figure}
512
513
%\begin{figure}[t]
514
% \includegraphics[width=0.5\textwidth]{./images/smb_read_bytes_pdf.png}
515
% \vspace{-2em}
516
% \caption{PDF of Bytes Transferred for Read I/O}
517
% \label{fig:PDF-Bytes-Read}
518
%\end{figure}
519
%
520
%\begin{figure}[t]
521
% \includegraphics[width=0.5\textwidth]{./images/smb_read_bytes_cdf.png}
522
% \vspace{-2em}
523
% \caption{CDF of Bytes Transferred for Read I/O}
524
% \label{fig:CDF-Bytes-Read}
525
%\end{figure}
526
%
527
%\begin{figure}[t]
528
% \includegraphics[width=0.5\textwidth]{./images/smb_write_bytes_pdf.png}
529
% \vspace{-2em}
530
% \caption{PDF of Bytes Transferred for Write I/O}
531
% \label{fig:PDF-Bytes-Write}
532
%\end{figure}
533
%
534
%\begin{figure}[t]
535
% \includegraphics[width=0.5\textwidth]{./images/smb_write_bytes_cdf.png}
536
% \vspace{-2em}
537
% \caption{CDF of Bytes Transferred for Write I/O}
538
% \label{fig:CDF-Bytes-Write}
539
%\end{figure}
Jan 16, 2020
541
\begin{figure}[t]
542
\includegraphics[width=0.5\textwidth]{./images/smb_2019_bytes_pdf.png}
Duncan
Feb 2, 2020
543
\vspace{-2em}
544
\caption{PDF and CDF of Bytes Transferred for Read and Write I/O}
545
\label{fig:SMB-Bytes-IO}
546
\end{figure}
547
548
%\begin{figure}
549
% \includegraphics[width=0.5\textwidth]{./images/CDF-ioBuff-win.pdf}
550
% \caption{CDF of Bytes Transferred for Read+Write I/O}
551
% \label{fig:CDF-Bytes-RW}
552
%\end{figure}
553
Figure~\ref{fig:SMB-Bytes-IO} %and~\ref{fig:CDF-Bytes-Write}
shows cumulative distribution functions (CDF) for bytes read and bytes written. As can be seen, almost no read transfer sizes are less than 32 bytes, whereas 20\% of the writes are smaller than 32 bytes. Table~\ref{fig:transferSizes} shows a tabular view of this data. For reads, $34.97$\% are between 64 and 512 bytes, with another $28.86$\% at 64 byte request sizes. There is a negligible percentage of read requests larger than 512 bytes.
The read sizes in our data are roughly a factor of four smaller than those observed by Leung et~al.
%This read data is similar to what was observed by Leung et al, however at an order of magnitude smaller.
%Writes observed also differ from previous inspection of the protocol's usage. % are very different.
Leung et~al.\ showed that $60$--$70$\% of writes were less than 4K in size and $90$\% less than 64K in size. In our data, however, we see that almost all writes are less than 1K in size. In fact, $11.16$\% of writes are less than 4 bytes, $52.41$\% are 64 byte requests, and $43.63$\% of requests are less than 64 bytes.
In the ten years since the last study, it is clear that writes have become significantly smaller. In our analysis of a subset of the writes, we found that a significant part of the write profile was writes to cookies, which are necessarily small files. The preponderance of web applications and the associated tracking is a major change in how computers and data storage are used compared to a decade ago. These small data reads and writes significantly alter the assumptions that most network storage systems are designed for.
Feb 3, 2020
560
%This may be explained by the fact that large files, and multiple files, are being written as standardized blocks more fitting to the frequent update of larger data-sets and disk space available. This could be as an effort to improve the fidelity of data across the network, allow for better realtime data consistency between client and backup locations, or could just be due to a large number of scripts being run that create and update a series of relatively smaller documents.
561
%\textbf{Note: It seems like a change in the order of magnitude that is being passed per packet. What would this indicate?}\textcolor{red}{Answer the question. Shorter reads/writes = better?}
Jan 16, 2020
563
\begin{table}[]
564
\centering
565
\begin{tabular}{|l|c|c|}
566
\hline
567
Transfer size & Reads & Writes \\ \hline
568
$< 4$ & 0.098\% & 11.16\% \\
569
$= 4$ & 1.16\% & 4.13\% \\
570
$>4, < 64$ & 34.89\% & 28.14\% \\
571
$= 64$ & 28.86\% & 52.41\% \\
572
$>64, < 512$ & 34.97\% & 4.15\% \\
573
$= 512$ & 0.002\% & 2.54e-5\% \\
574
$= 1024$ & 1.22e-5\% & 3.81e-5\% \\ \hline
575
\end{tabular}
576
\caption{\label{fig:transferSizes}Percentage of transfer sizes for reads and writes}
577
\vspace{-2em}
578
\end{table}
579
Jan 16, 2020
580
In comparison of the read, write, and create operations we found that the vast majority
of these types of I/O belong to creates. By the fact that there are so many creates, it
seems apparent that many applications create new files rather than updating existing
files when files are modified. Furthermore, read operations account for the largest aggregate of bytes transferred over the network. However, the amount of bytes transferred by write commands is not far behind, although, non-intuitively, including a larger number of standardized relatively smaller writes. The most unexpected finding of the data is that all the reads and writes are performed using much smaller buffers than expected; about an order of magnitude smaller (e.g.\ bytes instead of kilobytes).
584
585
% XXX I think we should get rid of this figure - not sure it conveys anything important that is not better conveyed than the CDF
586
%Figure~\ref{fig:Agg-AvgRT} shows the average response time (RT) for the different I/O operations. The revealing information is that write I/Os take the longest average time. This is expected since writes transfer more data on average. There is an odd spike for create I/O which can be due to a batch of files or nested directories being made. There are points where read I/O RT can be seen, but this only occurs in areas where large RT for write I/O occur. This is attributed to a need to verify the written data.
587
588
%\begin{figure}
589
% \includegraphics[width=0.5\textwidth]{./images/aggAvgRTs-windowed.pdf}
590
% \caption{Average Response Time by I/O Operation}
591
% \label{fig:Agg-AvgRT}
592
%\end{figure}
593
594
% XXX I think we should get rid of this figure - not sure it conveys anything important that is not better conveyed than the CDF
595
%Figure~\ref{fig:Agg-AvgBytes} shows the average inter arrival time (IAT) for the different I/O operations. \textcolor{red}{Issue: Data only exists for general operations, NOT for other operations. In other words, data for all other operations was IAT of zero.} \textcolor{blue}{Idea: This is due to single operation by a single user and then no operation being performed again. This would aligns with the ideas of Lueng et.~al.~who noticed that files were being interacted with only once or twice and then not again.}
596
597
%\begin{figure}
598
% \includegraphics[width=0.5\textwidth]{./images/aggAvgIATs-windowed.pdf}
599
% \caption{Average Inter Arrival Time by I/O Operation}
600
% \label{fig:Agg-AvgIAT}
601
%\end{figure}
602
603
%The following is a list of data collected and why:
604
%\begin{itemize}
605
% \item TID-to-IP map: with the hashing, the only way to maintain mapping of `share-types' (i.e. share-paths) to TIDs is via IP (reverse DNS).
606
% \item FID Data: holds the number of reads, writes, and size of the FID (tracked) for which this information is tracked (per FID).
607
% \item Tuple Data: holds the reads and writes performed by a seen tuple (per tuple) along with by the tuple and FID's data.
608
% \item TID Data: holds the number of reads, writes, creates, and total I/O events along with the last time each/any command was seen. Maps are kept of the buffs seen, general IAT, read IAT, write IAT, create IATs.
609
% \item Tuple Info: Tracking the tuples seen along with a map to that tuple's (per tuple) data.
610
% \item Oplock Data: Tracks the different types of oplocks that are seen per 15 minutes.
611
% \item Read/Write Buff: Maps that are used to track the different sized buffers used for Read/Write commands.
612
% \item `filesizeMap': Used for track the different sized buffers to pass data along the network (generic and all inclusive; ie. tuple level data).
613
% \item I/O Events: Track the number of I/O events seen in 15 minute periods. I/Os include - read, write, create, general.
614
%\end{itemize}
615
616
\subsection{I/O Response Times}
617
Jan 16, 2020
618
%~!~ Addition since Chandy writing ~!~%
619
Most previous tracing work has not reported I/O response times or command latency, which is generally proportional to data request size; under load, however, the response times give an indication of server load. In
Table~\ref{tbl:PercentageTraceSummary} we show a summary of the response times for read, write, create, and general commands. We note that most general (metadata) operations occur at high frequency but run relatively slowly.
We also observe that the number of writes is very close to the number of reads. The write response time is very small---most likely because the storage server caches the write without actually committing it to disk. Reads, on the other hand, are in most cases probably not going to hit in the cache and require an actual read from the storage media. Although read operations are only a small percentage of all operations, they have the highest average response time. As noted above, creates happen more frequently, but have a slightly slower response time, because of the extra metadata operations required for a create as opposed to a simple write.
Jan 16, 2020
622
623
% Note: RT + IAT time CDFs exist in data output
624
625
% IAT information
626
627
%\begin{figure}[t!]
628
% \includegraphics[width=0.5\textwidth]{./images/smb_general_iats_cdf.png}
629
% \caption{CDF of Inter Arrival Time for General I/O}
630
% \label{fig:CDF-IAT-General}
631
%\end{figure}
632
%
633
%\begin{figure}[t!]
634
% \includegraphics[width=0.5\textwidth]{./images/smb_general_iats_pdf.png}
635
% \caption{PDF of Inter Arrival Time for General I/O}
636
% \label{fig:PDF-IAT-General}
637
%\end{figure}
638
%
639
%\begin{figure}[t!]
640
% \includegraphics[width=0.5\textwidth]{./images/smb_general_rts_cdf.png}
641
% \caption{CDF of Response Time for General I/O}
642
% \label{fig:CDF-RT-General}
643
% \vspace{-2em}
644
%\end{figure}
645
%
646
%\begin{figure}[t!]
647
% \includegraphics[width=0.5\textwidth]{./images/smb_general_rts_pdf.png}
648
% \caption{PDF of Response Time for General I/O}
649
% \label{fig:PDF-RT-General}
650
% \vspace{-2em}
651
%\end{figure}
652
Jan 16, 2020
653
\begin{figure}[t!]
654
\includegraphics[width=0.5\textwidth]{./images/smb_2019_iats_cdf.png}
655
\caption{CDF of Inter-Arrival Time for SMB I/O}
656
\label{fig:CDF-IAT-SMB}
657
%\vspace{-2em}
Jan 16, 2020
658
\end{figure}
659
660
\begin{figure}[t!]
661
\includegraphics[width=0.5\textwidth]{./images/smb_2019_iats_pdf.png}
662
\caption{PDF of Inter-Arrival Time for SMB I/O}
663
\label{fig:PDF-IAT-SMB}
664
%\vspace{-2em}
Jan 16, 2020
665
\end{figure}
666
667
\begin{figure}[t!]
668
\includegraphics[width=0.5\textwidth]{./images/smb_2019_rts_cdf.png}
669
\caption{CDF of Response Time for SMB I/O}
670
\label{fig:CDF-RT-SMB}
671
%\vspace{-2em}
Jan 16, 2020
672
\end{figure}
673
674
\begin{figure}[t!]
675
\includegraphics[width=0.5\textwidth]{./images/smb_2019_rts_pdf.png}
676
\caption{PDF of Response Time for SMB I/O}
677
\label{fig:PDF-RT-SMB}
678
%\vspace{-2em}
Jan 16, 2020
679
\end{figure}
680
681
\begin{table}[]
682
\centering
Jan 16, 2020
683
\begin{tabular}{|l|r|r|r|r|}
684
\hline
685
& Reads & Writes & Creates & General \\ \hline
Jan 16, 2020
686
I/O \% & 2.97 & \multicolumn{1}{r|}{2.80} & \multicolumn{1}{r|}{19.36} & \multicolumn{1}{r|}{74.87} \\ \hline
687
Avg RT ($\mu$s) & 59,819.7 & \multicolumn{1}{r|}{519.7} & \multicolumn{1}{r|}{698.1} & \multicolumn{1}{r|}{7,013.4} \\ \hline
688
Avg IAT ($\mu$s) & 33,220.8 & \multicolumn{1}{r|}{35,260.4} & \multicolumn{1}{r|}{5,094.5} & \multicolumn{1}{r|}{1,317.4} \\ \hline
689
%\hline
690
%Total RT (s) & 224248 & \multicolumn{1}{l|}{41100} & \multicolumn{1}{l|}{342251} & \multicolumn{1}{l|}{131495} \\ \hline
691
%\% Total RT & 30.34\% & \multicolumn{1}{l|}{5.56\%} & \multicolumn{1}{l|}{46.3\%} & \multicolumn{1}{l|}{17.79\%} \\ \hline
692
\end{tabular}
693
\caption{Summary of Trace Statistics: Average Response Time (RT) and Inter-Arrival Time (IAT)}
694
\label{tbl:PercentageTraceSummary}
695
\vspace{-2em}
696
\end{table}
697
698
%\begin{table}[]
699
%\centering
700
%\begin{tabular}{|l|l|l|l|l|l|}
701
%\hline
702
% & Reads & Writes & Creates & General R-W \\ \hline
703
%Total RT (ms) & 224248442 & \multicolumn{1}{l|}{41100075} & \multicolumn{1}{l|}{342251439} & \multicolumn{1}{l|}{131495153} & \multicolumn{1}{l|}{258573201} \\ \hline
704
%\% Total RT & 30.34\% & \multicolumn{1}{l|}{5.56\%} & \multicolumn{1}{l|}{46.3\%} & \multicolumn{1}{l|}{17.79\%} & \multicolumn{1}{l|}{34.99\%} \\ \hline
705
%\end{tabular}
706
%\caption{Summary of Response Time (RT) Statistics: Total RT and Percentage RT per Operation}
707
%\label{tbl:PercentageRTSummary}
708
%\end{table}
709
710
%\textcolor{red}{To get an indication of how much of an effect these general commands take on overall latency, we also calculated the total aggregate response time for read, write, create, and general operations. We see that even though general commands account for $74.87$\% of all commands, they only account for only $17.8$\% of the total response time. Thus, while the volume of general operations does not present an extraordinary burden on server load, reducing these operations can present a clear performance benefit. We also see that creates take the most amount of time ($46.3$\%) of the total response time for all operations. As seen in Table~\ref{tbl:SMBCommands}, the majority of general operations are negotiations while $28.71$\% are closes; which relate to create operations.
711
%This shows that while creates are only $5.08$\% on March 15th (and $2.5$\% of the week's operations shown in Table~\ref{tbl:PercentageTraceSummary}) of the total operations performed, they are responsible for $46.3$\% of the time spent performing network I/O.}
712
%\textbf{Do we need this above data piece?}
713
%
714
%% Not Needed to Say Since we have no data
715
%%One key observation is that there were no inter arrival time calculations for read, write, or create operations. We interpret this data to reflect the observations of Leung et.~al.~that noticed that files are interacted with only a few times and then not interacted with again. Extrapolating this concept, we interpret the data to illustrate that files may be read or written once, but then are not examined or interacted with again.
716
%%\textcolor{blue}{This was entirely unexpected and was discovered as a result of our original assumptions made based on what scope we believed to be the best interpretation of user activity on the network filesystem.}
717
%
718
%%\begin{table}[]
719
%%\centering
720
%%\begin{tabular}{|l|l|}
721
%%\hline
722
%% & Count \\ \hline
723
%%Sessions & 122 \\ \hline
724
%%Non-Sessions & 2 \\ \hline
725
%%\end{tabular}
726
%%\caption{Summary of Maximum Session and Non-Session Seen}
727
%%\label{tbl:Counts}
728
%%\end{table}
729
%%
730
%%\textcolor{red}{Not sure if presenting a count of the number of sessions seen is important or worth showing.}
731
%
732
%%\begin{table}[]
733
%%\centering
734
%%\begin{tabular}{|l|l|l|}
735
%%\hline
736
%% & Reads & Writes \\ \hline
737
%%Average & 27167.76 B & 106961.36 B \\ \hline
738
%%Percentage & 99.4\% & 0.6\% \\ \hline
739
%%\end{tabular}
740
%%\caption{Summary of Bytes Transferred Over the Network}
741
%%\label{tbl:Bytes}
742
%%\end{table}
743
%
744
%%\textcolor{red}{Reference the large single table instead}
745
%%Table~\ref{tbl:TraceSummary} shows our findings relating to the total number of bytes transferred over the network due to Read and Write operations. Mimicing the findings from Figure~\ref{fig:Agg-AvgBytes}, the table shows that while the percentage of total bytes passed over the network is dominated by Read operations the average bytes pushed by Write operations is of a magnitude greater.
746
%
747
%%Tables to be included:
748
%%\begin{enumerate}
749
%% \item Return Times:
750
%% \begin{itemize}
751
%% \item General
752
%% \item Read
753
%% \item Write
754
%% \item Create
755
%% \item Read+Write
756
%% \end{itemize}
757
%% \item Inter Arrival Times
758
%% \begin{itemize}
759
%% \item General
760
%% \item Read
761
%% \item Write
762
%% \item Create
763
%% \item Read+Write
764
%% \end{itemize}
765
%% \item Bytes per Request (Bytes Over Network)
766
%% \begin{itemize}
767
%% \item Read
768
%% \item Write
769
%% \item Read+Write
770
%% \end{itemize}
771
%%\end{enumerate}
772
%%Modeling to include:
773
%%\begin{enumerate}
774
%% \item Inter Arrival Time CDF
775
%% \begin{itemize}
776
%% \item Read
777
%% \item Write
778
%% \item Read+Write
779
%% \end{itemize}
780
%%\end{enumerate}
781
%
782
Figures~\ref{fig:CDF-IAT-SMB} and~\ref{fig:PDF-IAT-SMB} show the inter-arrival time CDFs and PDFs. As can be seen, SMB commands happen very frequently - $85$\% of commands are issued less than 1000~$\mu$s apart. As mentioned above, SMB is known to be very chatty, and it is clear that servers must spend a lot of time dealing with these commands. Most of these commands are also serviced fairly quickly as
783
seen in Figures~\ref{fig:CDF-RT-SMB} and~\ref{fig:PDF-RT-SMB}. Interestingly, the response time for the general metadata operations follows a similar curve to the inter-arrival times.
785
%Next we examine the response time (RT) of the read, write, and create I/O operations that occur over the SMB network filesystem.
786
The response time for write operations (shown in Figure~\ref{fig:CDF-RT-SMB}) does not follow the step function similar to the bytes written CDF in Figure~\ref{fig:SMB-Bytes-IO}. This is understandable as the response time for a write would be expected to be a more standardized action and not necessarily proportional to the number of bytes written. However, the read response time %(Figure~\ref{fig:CDF-RT-SMB})
787
is smoother than the bytes read CDF (Figure~\ref{fig:SMB-Bytes-IO}). This is most likely due to the fact that some of the reads are satisfied by server caches, thus eliminating some long access times to persistent storage.
788
However, one should notice that the response time on read operations grows at a rate similar to that of write operations. This, again, shows a form of standardization in the communication patterns, although some read I/Os take a far greater period of time, due to larger amounts of read data sent over several standardized-size packets.
789
%While the RT for Write operations are not included (due to their step function behavior) Figure~\ref{fig:CDF-RT-Read} and Figure~\ref{fig:CDF-RT-RW} show the response times for Read and Read+Write operations respectively. T
790
%\textcolor{red}{The write I/O step function behavior is somewhat visible in the CDF of both reads and writes in Figures~\ref{fig:CDF-RT-Read}~and~\ref{fig:CDF-RT-Write}. Moreover, this shows that the majority ($80$\%) of read (and write) operations occur within 2~$ms$, the average access time for enterprise storage disks. As would be expected, this is still an order of magnitude greater than the general I/O.}
791
792
%\begin{figure}[tp!]
793
% \includegraphics[width=0.5\textwidth]{./images/smb_read_iats_cdf.png}
Duncan
Feb 2, 2020
794
% \vspace{-2em}
795
% \caption{CDF of Inter Arrival Time for Read I/O}
796
% \label{fig:CDF-IAT-Read}
797
%\end{figure}
798
%
799
%\begin{figure}[tp!]
800
% \includegraphics[width=0.5\textwidth]{./images/smb_read_iats_pdf.png}
Duncan
Feb 2, 2020
801
% \vspace{-2em}
802
% \caption{PDF of Inter Arrival Time for Read I/O}
803
% \label{fig:PDF-IAT-Read}
804
%\end{figure}
805
%
806
%\begin{figure}[tp!]
807
% \includegraphics[width=0.5\textwidth]{./images/smb_read_rts_cdf.png}
808
% \vspace{-2em}
809
% \caption{CDF of Response Time for Read I/O}
810
% \label{fig:CDF-RT-Read}
811
%% \vspace{-2em}
812
%\end{figure}
813
%
814
%\begin{figure}[tp!]
815
% \includegraphics[width=0.5\textwidth]{./images/smb_read_rts_pdf.png}
816
% \vspace{-2em}
817
% \caption{PDF of Response Time for Read I/O}
818
% \label{fig:PDF-RT-Read}
819
%% \vspace{-2em}
820
%\end{figure}
Jan 16, 2020
822
% RTs information
823
824
%\begin{figure}[t!]
825
% \includegraphics[width=0.5\textwidth]{./images/smb_write_iats_cdf.png}
Duncan
Feb 2, 2020
826
% \vspace{-2em}
827
% \caption{CDF of Inter Arrival Time for Write I/O}
828
% \label{fig:CDF-IAT-Write}
829
%\end{figure}
830
%
831
%\begin{figure}[t!]
832
% \includegraphics[width=0.5\textwidth]{./images/smb_write_iats_pdf.png}
Duncan
Feb 2, 2020
833
% \vspace{-2em}
834
% \caption{PDF of Inter Arrival Time for Write I/O}
835
% \label{fig:PDF-IAT-Write}
836
%\end{figure}
837
%
838
%\begin{figure}[t!]
839
% \includegraphics[width=0.5\textwidth]{./images/smb_write_rts_cdf.png}
Duncan
Feb 2, 2020
840
% \vspace{-2em}
841
% \caption{CDF of Return Time for Write IO}
842
% \label{fig:CDF-RT-Write}
843
%% \vspace{-2em}
844
%\end{figure}
845
%
846
%\begin{figure}[t!]
847
% \includegraphics[width=0.5\textwidth]{./images/smb_write_rts_pdf.png}
Duncan
Feb 2, 2020
848
% \vspace{-2em}
849
% \caption{PDF of Return Time for Write IO}
850
% \label{fig:PDF-RT-Write}
851
%% \vspace{-2em}
852
%\end{figure}
853
%
854
%\begin{figure}[t!]
855
% \includegraphics[width=0.5\textwidth]{./images/smb_create_iats_cdf.png}
856
% \caption{CDF of Inter Arrival Time for Create I/O}
857
% \vspace{-2em}
858
% \label{fig:CDF-IAT-Create}
859
%\end{figure}
860
%
861
%\begin{figure}[t!]
862
% \includegraphics[width=0.5\textwidth]{./images/smb_create_iats_pdf.png}
863
% \vspace{-2em}
864
% \caption{PDF of Inter Arrival Time for Create I/O}
865
% \label{fig:PDF-IAT-Create}
866
%\end{figure}
867
%
868
%\begin{figure}[t!]
869
% \includegraphics[width=0.5\textwidth]{./images/smb_create_rts_cdf.png}
870
% \vspace{-2em}
871
% \caption{CDF of Response Time for Create I/O}
872
% \label{fig:CDF-RT-Create}
873
%% \vspace{-2em}
874
%\end{figure}
875
%
876
%\begin{figure}[t!]
877
% \includegraphics[width=0.5\textwidth]{./images/smb_create_rts_pdf.png}
878
% \vspace{-2em}
879
% \caption{PDF of Response Time for Create I/O}
880
% \label{fig:PDF-RT-Create}
881
%% \vspace{-2em}
882
%\end{figure}
883
884
%\begin{figure}
885
% \includegraphics[width=0.5\textwidth]{./images/CDF-ioRT-win.pdf}
886
% \caption{CDF of Response Time for Read+Write I/ O}
887
% \label{fig:CDF-RT-RW}
888
%\end{figure}
889
890
%\begin{figure}
891
% \includegraphics[width=0.5\textwidth]{./images/CDF-rBuff-win.pdf}
892
% \caption{CDF of Bytes Transferred for Read IO}
893
% \label{fig:CDF-Bytes-Read}
894
%\end{figure}
895
896
%\begin{figure}
897
% \includegraphics[width=0.5\textwidth]{./images/CDF-wBuff-win.pdf}
898
% \caption{CDF of Bytes Transferred for Write IO}
899
% \label{fig:CDF-Bytes-Write}
900
%\end{figure}
901
902
%\begin{figure}
903
% \includegraphics[width=0.5\textwidth]{./images/CDF-ioBuff-win.pdf}
904
% \caption{CDF of Bytes Transferred for Read+Write IO}
905
% \label{fig:CDF-Bytes-RW}
906
%\end{figure}
907
Jan 16, 2020
908
\subsection{File Extensions}
909
Tables~\ref{tab:top10SMB2FileExts} and~\ref{tab:commonSMB2FileExts} show a summary of the various file extensions that were seen within the SMB2 traffic during the three-week capture period; following the \textit{smb2.filename} field. The easier of the two to understand is Table~\ref{tab:commonSMB2FileExts}, which illustrates the number of common file extensions (e.g.\ doc, ppt, xls, pdf) that were part of the data.
Jan 16, 2020
910
%The greatest point of note is that the highest percentage is ``.xml'' with $0.54$\%, which is found to be surprising result.
911
Originally we expected that these common file extensions would be a much larger total of traffic. However, as seen in Table~\ref{tab:commonSMB2FileExts}, these common file extensions were less than $2$\% of total files seen. The top ten extensions that we saw (Table~\ref{tab:top10SMB2FileExts}) comprised approximately $84$\% of the total seen.
912
Furthermore, the majority of extensions are not readily identified.
913
Upon closer examination of the tracing system it was determined that
914
%these file extensions are an artifact of how Windows interprets file extensions. The Windows operating system merely guesses the file type based on the assumed extension (e.g. whatever characters follow after the final `.').
915
many files simply do not have a valid extension. These range from Linux-based library files, man pages, odd naming schemes as part of scripts or back-up files, as well as date-times and IPs as file names. There are undoubtedly more, but exhaustive determination of all variations is seen as out of scope for this work.
Jan 16, 2020
916
Duncan
Apr 23, 2020
917
%\textcolor{red}{Add in information stating that the type of OS in use in the university environment range from Windows, Unix, BSD, as well as other odd operating systems used by the engineering department.}
918
Jan 16, 2020
919
\begin{table}[]
920
\centering
921
\begin{tabular}{|l|l|l|}
922
\hline
923
SMB2 Filename Extension & Occurrences & Percentage of Total \\ \hline
924
-Travel & 33,396,147 & 15.26 \\
925
o & 28,670,784 & 13.1 \\
926
e & 28,606,421 & 13.07 \\
927
N & 27,639,457 & 12.63 \\
928
one & 27,615,505 & 12.62 \\
929
\textless{}No Extension\textgreater{} & 27,613,845 & 12.62 \\
930
d & 2,799,799 & 1.28 \\
931
l & 2,321,338 & 1.06 \\
932
x & 2,108,279 & 0.96 \\
933
h & 2,019,714 & 0.92 \\ \hline
Jan 16, 2020
934
\end{tabular}
Duncan
Feb 2, 2020
935
\caption{Top 10 File Extensions Seen Over Three Week Period}
Jan 16, 2020
936
\label{tab:top10SMB2FileExts}
937
\end{table}
938
939
\begin{table}[]
940
\centering
941
\begin{tabular}{|l|l|l|}
942
\hline
943
SMB2 Filename Extension & Occurrences & Percentage of Total \\ \hline
944
doc & 352,958 & 0.16 \\
945
docx & 291,047 & 0.13 \\
946
ppt & 46,706 & 0.02 \\
947
pptx & 38,604 & 0.02 \\
948
xls & 218,031 & 0.1 \\
949
xlsx & 180,676 & 0.08 \\
Jan 16, 2020
950
odt & 28 & 0.000013 \\
951
pdf & 375,601 & 0.17 \\
952
xml & 1,192,840 & 0.54 \\
953
txt & 167,827 & 0.08 \\ \hline
Jan 16, 2020
954
\end{tabular}
Duncan
Feb 2, 2020
955
\caption{Common File Extensions Seen Over Three Week Period}
Jan 16, 2020
956
\label{tab:commonSMB2FileExts}
957
\end{table}
958
959
%Points worth mentioning:
960
%\begin{itemize}
961
% \item Scale of time is only to the microsecond due to the original pcap file capturing process. \texttt{tshark} only captures to a microsecond scale in our implementation.
962
% \item Due to a complication of how DataSeries stores information, there are potentially more SMB2 packets than actually occurred since $0$ is an acceptable command for SMB2 (although not used for SMB).
963
%\end{itemize}
964
965
\subsection{Distribution Models}
966
Jan 16, 2020
967
For simulations and analytic modeling, it is often useful to have models that describe storage systems I/O behavior. In this section, we attempt to map traditional probabilistic distributions to the data that we have observed.
968
Specifically, taking the developed CDF graphs, we perform curve fitting to determine the applicability of Gaussian and Weibull distributions to the network filesystem I/O behavior. Note that an exponential distribution, typically used to model interarrival times and response times, is a special case of a Weibull distribution where $k=1$.
969
Table~\ref{tbl:curveFitting} shows best-fit parametrized distributions for the measured data. % along with $R^2$ fitness values.
970
971
%Based on the collected IAT and RT data, the following are the best fit curve representation equations with supporting $R^{2}$ values. In the case of each, it was found that the equation used to model the I/O behavior was a Gaussian equation with a single term.
972
%\begin{equation} f(x) = a_1 * e^{-((x-b_1)/c_1)^2)} \end{equation}
973
%The $R^2$ values for each CDF graph were found to be the following:
974
%\begin{itemize}
975
% \item General Command IAT CDF, shown in Figure~\ref{fig:CDF-IAT-General}, had $R^2$ Value of $0.6704$.
976
% \item General Command RT CDF, shown in Figure~\ref{fig:CDF-RT-General}, had $R^2$ Value of $0.9728$.
977
% \item Read command RT CDF, shown in Figure~\ref{fig:CDF-RT-Read}, had $R^2$ Value of $0.7754$.
978
% \item Write command RT CDF, shown in Figure~\ref{fig:CDF-RT-Write}, had $R^2$ Value of $0.7797$
979
% \item Create command RT CDF, shown in Figure~\ref{fig:CDF-RT-Create}, had $R^2$ Value of $0.07146$
980
% \item Read + Write command RT CDF, shown in Figure~\ref{fig:CDF-RT-RW}, has $R^2$ Value of $0.7837$.
981
%\end{itemize}
982
983
\begin{table*}
984
\centering
985
\begin{tabular}{|l|c|c|c||c|c|c|}
986
\hline
987
Model & \multicolumn{3}{|c|}{Gaussian}
988
& \multicolumn{3}{|c|}{Weibull} \\ \hline
989
CDF & \multicolumn{3}{|c|}{$\frac{1}{\sqrt{2\pi}}\int_{-\infty}^{\frac{x-\mu}{\sigma}}e^{\frac{-t^2}{2}}dt$}
990
& \multicolumn{3}{|c|}{$1 - e^{(-x/\lambda)^k}$} \\ \hline \hline
991
I/O Operation & $\mu$ & \multicolumn{2}{|c|}{$\sigma$} & $k$ & \multicolumn{2}{|c|}{$\lambda$} \\ \hline
992
General RT & 3606.66$\pm$742.44 & \multicolumn{2}{|c|}{2.74931e+06$\pm$530} & 0.5652$\pm$0.0001 & \multicolumn{2}{|c|}{980.9721$\pm$0.4975} \\
993
General IAT & 786.72$\pm$2.79 & \multicolumn{2}{|c|}{10329.6$\pm$2} & 0.9031$\pm$0.0002 & \multicolumn{2}{|c|}{743.2075$\pm$0.2341} \\
994
Read RT & 44718.5$\pm$11715 & \multicolumn{2}{|c|}{1.72776e+07$\pm$8300} & 0.0004$\pm$0.0 & \multicolumn{2}{|c|}{1.5517$\pm$0.0028} \\
995
Read IAT & 24146$\pm$8062 & \multicolumn{2}{|c|}{1.189e+07$\pm$5700} & 0.0005$\pm$0.0 & \multicolumn{2}{|c|}{3.8134$\pm$0.0057} \\
996
Write RT & 379.823$\pm$2.809 & \multicolumn{2}{|c|}{4021.72$\pm$1.99} & 0.8569$\pm$0.0004 & \multicolumn{2}{|c|}{325.2856$\pm$0.2804} \\
997
Write IAT & 25785.7$\pm$8556.6 & \multicolumn{2}{|c|}{1.22491e+07$\pm$6000} & 0.0004$\pm$0.0 & \multicolumn{2}{|c|}{3.1287$\pm$0.0052} \\
998
Create RT & 502.084$\pm$5.756 & \multicolumn{2}{|c|}{21678.4$\pm$4.1} & 0.9840$\pm$0.0002 & \multicolumn{2}{|c|}{496.9497$\pm$0.1403} \\
999
Create IAT & 3694.82$\pm$1236.16 & \multicolumn{2}{|c|}{4.65553e+06$\pm$880} & 0.0008$\pm$0.0 & \multicolumn{2}{|c|}{2.3504$\pm$0.0009} \\ \hline
1000
%R+W RT & \textcolor{red}{0.8045} & \multicolumn{2}{|c|}{\textcolor{red}{0.2122}} & \textcolor{red}{5.103} & \multicolumn{2}{|c|}{\textcolor{red}{0.3937}} \\ \hline
1001
%R+W Byte Transfer & \textcolor{red}{0.3744} & \multicolumn{2}{|c|}{\textcolor{red}{0.2983}} & \textcolor{red}{1.153} & \multicolumn{2}{|c|}{\textcolor{red}{0.3937}} \\
1002
Read Buff Transfer & 82.9179$\pm$0.7641 & \multicolumn{2}{|c|}{1117.9$\pm$0.54} & 1.0548$\pm$0.0003 & \multicolumn{2}{|c|}{85.2525$\pm$0.0575} \\
1003
Write Buff Transfer & 46.2507$\pm$0.4475 & \multicolumn{2}{|c|}{640.621$\pm$0.316} & 1.0325$\pm$0.0004 & \multicolumn{2}{|c|}{46.8707$\pm$0.0328} \\ \hline
1004
\end{tabular}
1005
\caption{\label{tbl:curveFitting}Comparison of %$R^2$
1006
$\mu$, $\sigma$, $k$, and $\lambda$ Values for Curve Fitting Equations on CDF Graphs}
1007
\vspace{-3em}
1008
\end{table*}
1009
1010
%The graphs created by the dissection script are:
1011
%\begin{itemize}
1012
% \item Average IAT (G/R/W/C) - By DateTime.
1013
% \item Average Bytes (R/W) - By DateTime.
1014
% \item Session I/Os (G/R/W/C) - By DateTime.
1015
% \item Non-Session I/Os (G/R/W/C) - By DateTime.
1016
% \item Tuple Counts - By DateTime.
1017
% \item Total Bytes (R+W/R/W) - By DateTime.
1018
% \item Total I/Os (G/R/W) - By DateTime.
1019
%\end{itemize}
1020
1021
%Observations on graphs:
1022
%\begin{itemize}
1023
% \item Avergage IAT - majority write/general.
1024
% \item Total I/O - majority are general I/O.
1025
% \item Average Bytes - majority are writes.
1026
% \item Bytes Total - majority reads.
1027
% \item Tuple counts are close to same as session counts.
1028
%\end{itemize}
1029
1030
%Examination of the Response Time (RT) and Inter Arrival Times (IAT) revealed the speed and frequency with which metadata operations are performed, as well as the infrequency of individual users and sessions to interact with a given share.
1031
1032
%% NEED: Run the matlab curve fitting to complete this section of the writing
Duncan