summaryrefslogtreecommitdiffstats
path: root/doc/dsc-manual.tex
blob: 501d34ac7768a142c8daa370b635889fb0874dce (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
\documentclass{report}
\usepackage{epsfig}
\usepackage{path}
\usepackage{fancyvrb}

\def\dsc{{\sc dsc}}

\DefineVerbatimEnvironment%
  {MyVerbatim}{Verbatim}
  {frame=lines,framerule=0.8mm,fontsize=\small}

\renewcommand{\abstractname}{}

\begin{document}

\begin{titlepage}
\title{DSC Manual}
\author{Duane Wessels, Measurement Factory\\
Ken Keys, CAIDA\\
\\
http://dns.measurement-factory.com/tools/dsc/}
\date{\today}
\end{titlepage}

\maketitle

\begin{abstract}
\setlength{\parskip}{1ex}
\section{Copyright}

The DNS Statistics Collector (dsc)

Copyright 2003-2007 by The Measurement Factory, Inc., 2007-2008 by Internet
Systems Consortium, Inc., 2008-2019 by OARC, Inc.

{\em info@measurement-factory.com\/}, {\em info@isc.org\/}

\section{License}

{\dsc} is licensed under the terms of the BSD license:

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
Neither the name of The Measurement Factory nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

\section{Contributors}
\begin{itemize}
\item Duane Wessels, Measurement Factory
\item Ken Keys, Cooperative Association for Internet Data Analysis
\item Sebastian Castro, New Zealand Registry Services
\end{itemize}
\end{abstract}


\tableofcontents

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Introduction}

{\dsc} is a system for collecting and presenting statistics from
a busy DNS server.

\section{Components}

{\dsc} consists of the following components:
\begin{itemize}
\item A data collector
\item A data presenter, where data is archived and rendered
\item A method for securely transferring data from the collector
	to the presenter
\item Utilities and scripts that parse XML and archive files from the collector
\item Utilities and scripts that generate graphs and HTML pages
\end{itemize}

\subsection{The Collector}

The collector is a binary program, named {\tt dsc\/}, which snoops
on DNS messages.  It is written in C and uses {\em libpcap\/} for
packet capture.

{\tt dsc\/} uses a relatively simple configuration file called {\em
dsc.conf\/} to define certain parameters and options.  The configuration
file also determines the {\em datasets\/} that {\tt dsc\/} collects.

A dataset is a 2-D array of counters of IP/DNS message properties.
You can define each dimension of the array independently.  For
example you might define a dataset categorized by DNS query type
along one dimension and TLD along the other.
{\tt dsc\/} dumps the datasets from memory to XML files every 60 seconds.

\subsection{XML Data Transfer}

You may run the {\dsc} collector on a remote machine.  That
is, the collector may run on a different machine than where the
data is archived and displayed.  {\dsc} includes some Perl and {\tt /bin/sh}
scripts to move XML files from collector to presenter.  One
technique uses X.509 certificates and a secure HTTP server.  The other
uses {\em rsync\/}, presumably over {\em ssh\/}.

\subsubsection{X.509/SSL}

To make this work, Apache/mod\_ssl should run on the machine where data
is archived and presented.
Data transfer is authenticated via SSL X.509 certificates.  A Perl
CGI script handles all PUT requests on the server.  If the client
certificate is allowed, XML files are stored in the appropriate
directory.

A shell script runs on the collector to upload the XML files.  It
uses {\tt curl\/}\footnote{http://curl.haxx.se} to establish an
HTTPS connection.  XML files are bundled together with {\tt tar\/}
before transfer to eliminate per-connection delays.
You could use {\tt scp\/} or {\tt rsync\/} instead of
{\tt curl\/} if you like.

\path|put-file.pl| is the script that accepts PUT requests on the
HTTP server.  The HTTP server validates the client's X.509 certificate.
If the certificate is invalid, the PUT request is denied.  This
script reads environment variables to get X.509 parameters.  The
uploaded data is stored in a directory based on the X.509 Organizational
Unit (server) and Common Name (node) fields.

\subsubsection{rsync/ssh}

This technique uses the {\em rsync\/} utility to transfer files.
You'll probably want to use {\em ssh\/} as the underlying transport,
although you can still use the less-secure {\em rsh\/} or native
rsync server transports if you like.

If you use {\em ssh\/} then you'll need to create passphrase-less
SSH keys so that the transfer can occur automatically.  You may
want to create special {\em dsc\/} userids on both ends as well.

\subsection{The Extractor}

The XML extractor is a Perl script that reads the XML files from
{\tt dsc\/}.  The extractor essentially converts the XML-structured
data to a format that is easier (faster) for the graphing tools to
parse.  Currently the extracted data files are line-based ASCII
text files.  Support for SQL databases is planned for the future.

\subsection{The Grapher}

{\dsc} uses {\em Ploticus\/}\footnote{http://ploticus.sourceforge.net/}
as the graphing engine.  A Perl module and CGI script read extracted
data files and generate Ploticus scriptfiles to generate plots.  Plots
are always generated on demand via the CGI application.

\path|dsc-grapher.pl| is the script that displays graphs from the
archived data.


\section{Architecture}

Figure~\ref{fig-architecture} shows the {\dsc} architecture.

\begin{figure}
\centerline{\psfig{figure=dsc-arch.eps,width=3.5in}}
\caption{\label{fig-architecture}The {\dsc} architecture.}
\end{figure}

Note that {\dsc} utilizes the concept of {\em servers\/} and {\em
nodes\/}.  A server is generally a logical service, which may
actually consist of multiple nodes.  Figure~\ref{fig-architecture}
shows six collectors (the circles) and two servers (the rounded
rectangles).  For a real-world example, consider a DNS root server.
IP Anycast allows a DNS root server to have geographically distributed
nodes that share a single IP address.  We call each instance a
{\em node\/} and all nodes sharing the single IP address belong
to the same {\em server\/}.

The {\dsc} collector program runs on or near\footnote{by
``near'' we mean that packets may be sniffed remotely via Ethernet taps, switch
port mirroring, or a SPAN port.} the remote nodes.  Its XML output
is transferred to the presentation machine via HTTPS PUTs (or something simpler
if you prefer).

The presentation machine includes an HTTP(S) server.  The extractor looks
for XML files PUT there by the collectors.  A CGI script also runs on
the HTTP server to display graphs and other information.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\chapter{Installing the Presenter}

You'll probably want to get the Presenter working before the Collector.
If you're using the secure XML data transfer, you'll need to
generate both client- and server-side X.509 certificates.

Installing the Presenter involves the following steps:
\begin{itemize}
\setlength{\itemsep}{0ex plus 0.5ex minus 0.0ex}
\item
	Install Perl dependencies
\item
	Install {\dsc} software
\item
	Create X.509 certificates (optional)
\item
	Set up a secure HTTP server (e.g., Apache and mod\_ssl)
\item
	Add some cron jobs
\end{itemize}


\section{Install Perl Dependencies}

{\dsc} uses Perl for the extractor and grapher components.  You will
most likely need Perl~5.8, although Perl~5.6 may be sufficient.  You'll also need
these readily available third-party Perl modules, which you
can find via CPAN:

\begin{itemize}
\setlength{\itemsep}{0ex plus 0.5ex minus 0.0ex}
	\item CGI-Untaint (CGI::Untaint)
	\item CGI.pm (CGI)
	\item Digest-MD5 (Digest::MD5)
	\item File-Flock (File::Flock)
	\item File-Spec (File::Spec)
	\item File-Temp (File::Temp)
	\item Geography-Countries (Geography::Countries)
	\item Hash-Merge (Hash::Merge)
	\item IP-Country (IP::Country)
	\item MIME-Base64 (MIME::Base64)
	\item Math-Calc-Units (Math::Calc::Units)
	\item Scalar-List-Utils (List::Util)
	\item Text-Template (Text::Template)
	\item URI (URI::Escape)
	\item XML-Simple (XML::Simple)
	\item Net-DNS-Resolver (Net::DNS::Resolver)

\end{itemize}

\noindent
Also note that XML::Simple requires XML::Parser, which in
turn requires the {\em expat\/} package.

\section{Install Ploticus}

{\dsc} uses Ploticus to generate plots and graphs.  You can find
this software at \verb|http://ploticus.sourceforge.net|.  The {\em
Download\/} page has links to some pre-compiled binaries and packages.
FreeBSD and NetBSD users can find Ploticus in the ports/packages
collection.


\section{Install {\dsc} Software}

All of the extractor and grapher tools are Perl or {\tt /bin/sh}
scripts, so there is no need to compile anything.  Still,
you should run {\tt make} first:

\begin{MyVerbatim}
% cd presenter
% make
\end{MyVerbatim}

If you see errors about missing Perl prerequisites, you may want
to correct those before continuing.

The next step is to install the files.  Recall that
\path|/usr/local/dsc| is the hard-coded installation prefix.
You must create it manually:

\begin{MyVerbatim}
% mkdir /usr/local/dsc
% make install
\end{MyVerbatim}

Note that {\dsc}'s Perl modules are installed in the
``site\_perl'' directory.  You'll probably need {\em root\/}
privileges to install files there.

\section{CGI Symbolic Links}

{\dsc} has a couple of CGI scripts that are installed
into \path|/usr/local/dsc/libexec|.  You should add symbolic
links from your HTTP server's \path|cgi-bin| directory to
these scripts.

Both of these scripts have been designed to be mod\_perl-friendly.

\begin{MyVerbatim}
% cd /usr/local/apache/cgi-bin
% ln -s /usr/local/dsc/libexec/put-file.pl
% ln -s /usr/local/dsc/libexec/dsc-grapher.pl
\end{MyVerbatim}

You can skip the \path|put-file.pl| link if you plan to use
{\em rsync\/} to transfer XML files.
If you cannot create symbolic links, you'll need to manually
copy the scripts to the appropriate directory.


\section{/usr/local/dsc/data}

\subsection{X.509 method}

This directory is where \path|put-file.pl| writes incoming XML
files.  It should have been created when you ran {\em make install\/} earlier.
XML files are actually placed in {\em server\/} and {\em
node\/} subdirectories based on the authorized client X.509 certificate
parameters.  If you want \path|put-file.pl| to automatically create
the subdirectories, the \path|data| directory must be writable by
the process owner:

\begin{MyVerbatim}
% chgrp nobody /usr/local/dsc/data/
% chmod 2775 /usr/local/dsc/data/
\end{MyVerbatim}

Alternatively, you can create {\em server\/} and {\em node\/} directories
in advance and make those writable.

\begin{MyVerbatim}
% mkdir /usr/local/dsc/data/x-root/
% mkdir /usr/local/dsc/data/x-root/blah/
% mkdir /usr/local/dsc/data/x-root/blah/incoming/
% chgrp -R nobody /usr/local/dsc/data/x-root/blah/
% chmod 2775 /usr/local/dsc/data/x-root/blah/incoming/
\end{MyVerbatim}

Make sure that \path|/usr/local/dsc/data/| is on a large partition with
plenty of free space.  You can make it a symbolic link to another
partition if necessary.  Note that a typical {\dsc} installation
for a large DNS root server requires about 4GB to hold a year's worth
of data.

\subsection{rsync Method}

The directory structure is the same as above (for X.509).  The only
differences are that:
\begin{itemize}
\item
	The {\em server\/}, {\em node\/}, and {\em incoming\/}
	directories must be made in advance.
\item
	The directories should be writable by the userid associated
	with the {\em rsync}/{\em ssh\/} connection.  You may want
	to create a dedicated {\em dsc\/} userid for this.
\end{itemize}


\section{/usr/local/dsc/var/log}

The \path|put-file.pl| script logs its activity to
\path|put-file.log| in this directory.  It should have been
created when you ran {\em make install\/} earlier.  The directory
should be writable by the HTTP server userid (usually {\em nobody\/}
or {\em www\/}).  Unfortunately the installation isn't fancy enough
to determine that userid yet, so you must change the ownership manually:

\begin{MyVerbatim}
% chgrp nobody /usr/local/dsc/var/log/
\end{MyVerbatim}

Furthermore, you probably want to make sure the log file does not
grow indefinitely.  For example, on FreeBSD we add this line to \path|/etc/newsyslog.conf|:

\begin{MyVerbatim}
/usr/local/dsc/var/log/put-file.log nobody:wheel        644  10    *    @T00  BN
\end{MyVerbatim}

You need not worry about this directory if you are using the
{\em rsync\/} upload method.

\section{/usr/local/dsc/cache}

This directory, also created by {\em make install\/} above, holds cached
plot images.  It also must be writable by the HTTP userid:

\begin{MyVerbatim}
% chgrp nobody /usr/local/dsc/cache/
\end{MyVerbatim}

\section{Cron Jobs}

{\dsc} requires two cron jobs on the Presenter.  The first
is the one that processes incoming XML files.  It is called
\path|refile-and-grok.sh|.  We recommend running it every
minute.  You also may want to run the jobs at a lower priority
with {\tt nice\/}.  Here is the cron job that we use:

\begin{MyVerbatim}
* * * * * /usr/bin/nice -10 /usr/local/dsc/libexec/refile-and-grok.sh
\end{MyVerbatim}

The other useful cron script is \path|remove-xmls.pl|.  It removes
XML files older than a specified number of days.  Since most of the
information in the XML files is archived into easier-to-parse
data files, you can remove the XML files after a few days.  This is
the job that we use:

\begin{MyVerbatim}
@midnight find /usr/local/dsc/data/ | /usr/local/dsc/libexec/remove-xmls.pl 7
\end{MyVerbatim}

\section{Data URIs}

{\dsc} uses ``Data URIs'' by default.  This is a URI where the
content is base-64 encoded into the URI string.  It allows us
to include images directly in HTML output, such that the browser
does not have to make additional HTTP requests for the images.
Data URIs may not work with some browsers.

To disable Data URIs, edit {\em presenter/perllib/DSC/grapher.pm\/}
and change this line:

\begin{verbatim}
        $use_data_uri = 1;
\end{verbatim}

to

\begin{verbatim}
        $use_data_uri = 0;
\end{verbatim}

Also make this symbolic link from your HTTP server's ``htdocs'' directory:

\begin{verbatim}
# cd htdocs
# ln -s /usr/local/dsc/share/html dsc
\end{verbatim}



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\chapter{Configuring the {\dsc} Presenter}

This chapter describes how to create X.509 certificates and configure
Apache/mod\_ssl.  If you plan on using a different upload
technique (such as scp or rsync) you can skip these instructions.

\section{Generating X.509 Certificates}

We use X.509 certificates to authenticate both sides
of an SSL connection when uploading XML data files from
the collector to the presenter.

Certificate generation is a tricky thing.  We use three different
types of certificates:
\begin{enumerate}
\item A self-signed root CA certificate
\item A server certificate
\item Client certificates for each collector node
\end{enumerate}

In the client certificates
we use X.509 fields to store the collector's server and node name.
The Organizational Unit Name (OU) becomes the server name and
the Common Name (CN) becomes the node name.

The {\dsc} source code distribution includes some shell scripts
that we have
used to create X.509 certificates.  You can find them in the
\path|presenter/certs| directory.  Note these are not installed
into \path|/usr/local/dsc|.  You should edit \path|openssl.conf|
and enter the relevant information for your organization.

\subsection{Certificate Authority}

You may need to create a self-signed certificate authority if you
don't already have one.  The CA signs client and server certificates.
You will need to distribute the CA and client certificates to
collector sites.  Here is how to use our \path|create-ca-cert.sh|
script:

\begin{MyVerbatim}
% sh create-ca-cert.sh
CREATING CA CERT
Generating a 2048 bit RSA private key
..............................................................................
............+++
......+++
writing new private key to './private/cakey.pem'
Enter PEM pass phrase:
Verifying - Enter PEM pass phrase:
-----
\end{MyVerbatim}


\subsection{Server Certificate}

The server certificate is used by the HTTP server (Apache/mod\_ssl).
The clients will have a copy of the CA certificate so they
can validate the server's certificate when uploading XML files.
Use the \path|create-srv-cert.sh| script to create a server
certificate:

\begin{MyVerbatim}
% sh create-srv-cert.sh
CREATING SERVER REQUEST
Generating a 1024 bit RSA private key
..........................++++++
.....................................++++++
writing new private key to 'server/server.key'
Enter PEM pass phrase:
Verifying - Enter PEM pass phrase:
-----
You are about to be asked to enter information that will be incorporated
into your certificate request.
What you are about to enter is what is called a Distinguished Name or a DN.
There are quite a few fields but you can leave some blank
For some fields there will be a default value,
If you enter '.', the field will be left blank.
-----
Country Name (2 letter code) [AU]:US
State or Province Name (full name) [Some-State]:Colorado
Locality Name (eg, city) []:Boulder
Organization Name (eg, company) [Internet Widgits Pty Ltd]:The Measurement Factory, Inc
Organizational Unit Name (eg, section) []:DNS
Common Name (eg, YOUR name) []:dns.measurement-factory.com
Email Address []:wessels@measurement-factory.com

Please enter the following 'extra' attributes
to be sent with your certificate request
A challenge password []:
An optional company name []:
Enter pass phrase for server/server.key:
writing RSA key
CREATING SERVER CERT
Using configuration from ./openssl.conf
Enter pass phrase for ./private/cakey.pem:
Check that the request matches the signature
Signature ok
The Subject's Distinguished Name is as follows
countryName           :PRINTABLE:'US'
stateOrProvinceName   :PRINTABLE:'Colorado'
localityName          :PRINTABLE:'Boulder'
organizationName      :PRINTABLE:'The Measurement Factory, Inc'
organizationalUnitName:PRINTABLE:'DNS'
commonName            :PRINTABLE:'dns.measurement-factory.com'
emailAddress          :IA5STRING:'wessels@measurement-factory.com'
Certificate is to be certified until Jun  3 20:06:17 2013 GMT (3000 days)
Sign the certificate? [y/n]:y


1 out of 1 certificate requests certified, commit? [y/n]y
Write out database with 1 new entries
Data Base Updated
\end{MyVerbatim}

Note that the Common Name must match the hostname of the HTTP
server that receives XML files.

Note that the \path|create-srv-cert.sh| script rewrites the
server key file without the RSA password.  This allows your
HTTP server to start automatically without prompting for
the password.

The script leaves the server certificate and key in the \path|server|
directory.  You'll need to copy these over to the HTTP server config
directory as described later in this chapter.

\section{Client Certificates}

Generating client certificates is similar.  Remember that
the Organizational Unit Name and Common Name correspond to the
collector's {\em server\/} and {\em node\/} names.   For example:

\begin{MyVerbatim}
% sh create-clt-cert.sh
CREATING CLIENT REQUEST
Generating a 1024 bit RSA private key
................................++++++
..............++++++
writing new private key to 'client/client.key'
Enter PEM pass phrase:
Verifying - Enter PEM pass phrase:
-----
You are about to be asked to enter information that will be incorporated
into your certificate request.
What you are about to enter is what is called a Distinguished Name or a DN.
There are quite a few fields but you can leave some blank
For some fields there will be a default value,
If you enter '.', the field will be left blank.
-----
Country Name (2 letter code) [AU]:US
State or Province Name (full name) [Some-State]:California
Locality Name (eg, city) []:Los Angeles
Organization Name (eg, company) [Internet Widgits Pty Ltd]:Some DNS Server
Organizational Unit Name (eg, section) []:x-root
Common Name (eg, YOUR name) []:LAX
Email Address []:noc@example.com

Please enter the following 'extra' attributes
to be sent with your certificate request
A challenge password []:
An optional company name []:
CREATING CLIENT CERT
Using configuration from ./openssl.conf
Enter pass phrase for ./private/cakey.pem:
Check that the request matches the signature
Signature ok
The Subject's Distinguished Name is as follows
countryName           :PRINTABLE:'US'
stateOrProvinceName   :PRINTABLE:'California'
localityName          :PRINTABLE:'Los Angeles'
organizationName      :PRINTABLE:'Some DNS Server'
organizationalUnitName:PRINTABLE:'x-root  '
commonName            :PRINTABLE:'LAX'
emailAddress          :IA5STRING:'noc@example.com'
Certificate is to be certified until Jun  3 20:17:24 2013 GMT (3000 days)
Sign the certificate? [y/n]:y


1 out of 1 certificate requests certified, commit? [y/n]y
Write out database with 1 new entries
Data Base Updated
Enter pass phrase for client/client.key:
writing RSA key
writing RSA key
\end{MyVerbatim}

The client's key and certificate will be placed in a directory
based on the server and node names.  For example:

\begin{MyVerbatim}
% ls -l client/x-root/LAX
total 10
-rw-r--r--  1 wessels  wessels  3311 Mar 17 13:17 client.crt
-rw-r--r--  1 wessels  wessels   712 Mar 17 13:17 client.csr
-r--------  1 wessels  wessels   887 Mar 17 13:17 client.key
-rw-r--r--  1 wessels  wessels  1953 Mar 17 13:17 client.pem
\end{MyVerbatim}

The \path|client.pem| (and \path|cacert.pem|) files should be copied
to the collector machine.

\section{Apache Configuration}

\noindent
You need to configure Apache for SSL.  Here is what our configuration
looks like:

\begin{MyVerbatim}
SSLRandomSeed startup builtin
SSLRandomSeed startup file:/dev/random
SSLRandomSeed startup file:/dev/urandom 1024
SSLRandomSeed connect builtin
SSLRandomSeed connect file:/dev/random
SSLRandomSeed connect file:/dev/urandom 1024

<VirtualHost _default_:443>
DocumentRoot "/httpd/htdocs-ssl"
SSLEngine on
SSLCertificateFile /httpd/conf/SSL/server/server.crt
SSLCertificateKeyFile /httpd/conf/SSL/server/server.key
SSLCertificateChainFile /httpd/conf/SSL/cacert.pem

# For client-validation
SSLCACertificateFile /httpd/conf/SSL/cacert.pem
SSLVerifyClient require

SSLOptions +CompatEnvVars
Script PUT /cgi-bin/put-file.pl
</VirtualHost>
\end{MyVerbatim}

\noindent
Note the last line of the configuration specifies the CGI script
that accepts PUT requests.  The {\em SSLOptions\/}
line is necessary so that the CGI script receives certain HTTP
headers as environment variables.  Those headers/variables convey
the X.509 information to the script so it knows where to store
received XML files.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\chapter{Collector Installation}


A collector machine needs only the {\em dsc\/} binary, a configuration
file, and a couple of cron job scripts.

At this point, {\dsc} lacks certain niceties such as a \path|./configure|
script.   The installation prefix, \path|/usr/local/dsc|, is currently
hard-coded.


\section{Prerequisites}

You'll need a C/C++ compiler to compile the {\tt dsc\/} source code.

If the collector and archiver are different systems, you'll need a
way to transfer data files.  We recommend that you use the {\tt
curl\/} HTTP/SSL client.  You may use another technique, such as {\tt
scp\/} or {\tt rsync\/} if you prefer.

\section{Installation}

You can compile {\tt dsc\/} from the {\tt collector\/} directory:

\begin{MyVerbatim}
% cd collector
% make
\end{MyVerbatim}

Assuming there are no errors or problems during compilation, install
the {\tt dsc\/} binary and other scripts with:

\begin{MyVerbatim}
% make install
\end{MyVerbatim}

This installs five files:
\begin{Verbatim}
/usr/local/dsc/bin/dsc
/usr/local/dsc/etc/dsc.conf.sample
/usr/local/dsc/libexec/upload-prep.pl
/usr/local/dsc/libexec/upload-rsync.sh
/usr/local/dsc/libexec/upload-x509.sh
\end{Verbatim}

Of course, if you don't want to use the default installation
prefix, you can manually copy these files to a location
of your choosing.  If you do that, you'll also need to
edit the cron scripts to match your choice of pathnames, etc.

\section{Uploading XML Files}
\label{sec-install-collector-cron}

This section describes how XML files are transferred from
the collector to one or more Presenter systems.

As we'll see in the next chapter, each {\tt dsc} process
has its own {\em run directory\/}.  This is the directory
where {\tt dsc} leaves its XML files.  It usually has a
name like \path|/usr/local/dsc/run/NODENAME|\@.  XML files
are removed after they are successfully transferred.  If the
Presenter is unreachable, XML files accumulate here until
they can be transferred.  Make sure that you have
enough disk space to queue a lot of XML files in the
event of an outage.

In general we want to be able to upload XML files to multiple
presenters.  This is the reason behind the {\tt upload-prep.pl}
script.  This script runs every 60 seconds from cron:

\begin{MyVerbatim}
* * * * * /usr/local/dsc/libexec/upload-prep.pl
\end{MyVerbatim}

{\tt upload-prep.pl} looks for \path|dsc.conf| files in
\path|/usr/local/dsc/etc| by default.  For each config file
found, it cd's to the {\em run\_dir\/} and links\footnote{as in
``hard link'' made with \path|/bin/ln|.}
XML files to one or more upload directories.  The upload directories
are named \path|upload/dest1|, \path|upload/dest2|, and so on.

In order for all this to work, you must create the directories
in advance.   For example, if you are collecting stats on
your nameserver named {\em ns0\/}, and want to send the XML files
to two presenters (named oarc and archive), the directory structure
might look like:

\begin{MyVerbatim}
% set prefix=/usr/local/dsc
% mkdir $prefix/run
% mkdir $prefix/run/ns0
% mkdir $prefix/run/ns0/upload
% mkdir $prefix/run/ns0/upload/oarc
% mkdir $prefix/run/ns0/upload/archive
\end{MyVerbatim}

With that directory structure, the {\tt upload-prep.pl} script moves
XML files from the \path|ns0| directory to the two
upload directories, \path|oarc| and \path|archive|.

To actually transfer files to the presenter, use either
\path|upload-x509.sh| or \path|upload-rsync.sh|.

\subsection{upload-x509.sh}

This cron script is responsible for
actually transferring XML files from the upload directories
to the remote server.    It creates a {\em tar\/} archive
of XML files and then uploads it to the remote server with
{\tt curl}.  The script takes three commandline arguments:

\begin{MyVerbatim}
% upload-x509.sh NODE DEST URI
\end{MyVerbatim}

{\em NODE\/} must match the name of a directory under
\path|/usr/local/dsc/run|.  Similarly, {\em DEST\/} must match the
name of a directory under \path|/usr/local/dsc/run/NODE/upload|.
{\em URI\/} is the URL/URI that the data is uploaded to.  Usually
it is just an HTTPS URL with the name of the destination server.
We also recommend running this from cron every 60 seconds.  For
example:

\begin{MyVerbatim}
* * * * * /usr/local/dsc/libexec/upload-x509.sh ns0 oarc \
	https://collect.oarc.isc.org/
* * * * * /usr/local/dsc/libexec/upload-x509.sh ns0 archive \
	https://archive.example.com/
\end{MyVerbatim}

\path|upload-x509.sh| looks for X.509 certificates in
\path|/usr/local/dsc/certs|.  The client certificate should be named
\path|/usr/local/dsc/certs/DEST/NODE.pem| and the CA certificate
should be named
\path|/usr/local/dsc/certs/DEST/cacert.pem|.  Note that {\em DEST\/}
and {\em NODE\/} must match the \path|upload-x509.sh|
command line arguments.

\subsection{upload-rsync.sh}

This script can be used to transfer XML files from the upload
directories to the remote server.  It uses {\em rsync\/} and
assumes that {\em rsync\/} will use {\em ssh\/} for transport.
This script also takes three arguments:

\begin{MyVerbatim}
% upload-rsync.sh NODE DEST RSYNC-DEST
\end{MyVerbatim}

Note that {\em DEST\/} is the name of the local ``upload'' directory
and {\em RSYNC-DEST\/} is an {\em rsync\/} destination (i.e., hostname and remote directory).
Here is how you might use it in a crontab:

\begin{MyVerbatim}
* * * * * /usr/local/dsc/libexec/upload-rsync.sh ns0 oarc \
	dsc@collect.oarc.isc.org:/usr/local/dsc/data/Server/ns0
* * * * * /usr/local/dsc/libexec/upload-rsync.sh ns0 archive \
	dsc@archive.oarc.isc.org:/usr/local/dsc/data/Server/ns0
\end{MyVerbatim}

Also note that \path|upload-rsync.sh| will actually store the remote
XML files in \path|incoming/YYYY-MM-DD| subdirectories.  That is,
if your {\em RSYNC-DEST\/} is \path|host:/usr/local/dsc/data/Server/ns0|
then files will actually be written to
\path|/usr/local/dsc/data/Server/ns0/incoming/YYYY-MM-DD| on {\em host},
where \path|YYYY-MM-DD| is replaced by the year, month, and day of the
XML files.  These subdirectories reduce filesystem pressure in the event
of backlogs.

{\em rsync\/} over {\em ssh\/} requires you to use RSA or DSA public keys
that do not have a passphrase.  If you do not want to use one of
{\em ssh\/}'s default identity files, you can create one specifically
for this script.  It should be named \path|dsc_uploader_id| (and
\path|dsc_uploader_id.pub|) in the \$HOME/.ssh directory of the user
that will be running the script.  For example, you can create it
with this command:

\begin{MyVerbatim}
% ssh-keygen -t dsa -C dsc-uploader -f $HOME/.ssh/dsc_uploader_id
\end{MyVerbatim}

Then add \path|dsc_uploader_id.pub| to the \path|authorized_keys|
file of the receiving userid on the presenter system.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\chapter{Configuring and Running the {\dsc} Collector}

\section{dsc.conf}

Before running {\tt dsc\/} you need to create a configuration file.
Note that configuration directive lines are terminated with a semi-colon.
The configuration file currently understands the following directives:

\begin{description}

\item[local\_address]

	Specifies the DNS server's local IP address.  It is used
	to determine the ``direction'' of an IP packet: sending,
	receiving, or other.  You may specify multiple local addresses
	by repeating the {\em local\_address} line any number of times.

	Example: {\tt local\_address 172.16.0.1;\/}
	Example: {\tt local\_address 2001:4f8:0:2::13;\/}

\item[run\_dir]

	A directory that should become {\tt dsc\/}'s current directory
	after it starts.  XML files will be written here, as will
	any core dumps.

	Example: {\tt run\_dir "/var/run/dsc";\/}

\item[minfree\_bytes]

	If the filesystem where {\tt dsc\/} writes its XML files
	does not have at least this much free space, then
	{\tt dsc\/} will not write the XML files.  This prevents
	{\tt dsc\/} from filling up the filesystem.  The XML
	files that would have been written are simply lost and
	cannot be recovered.  {\tt dsc\/} will begin writing
	XML files again when the filesystem has the necessary
	free space.

\item[bpf\_program]

	A Berkeley Packet Filter program string.  Normally you
	should leave this unset.  You may use this to further
	restrict the traffic seen by {\tt dsc\/}.  Note that {\tt
	dsc\/} currently has one indexer that looks at all IP
	packets.  If you specify something like {\em udp port 53\/}
	that indexer will not work.

	However, if you want to monitor multiple DNS servers with
	separate {\dsc} instances on one collector box, then you
	may need to use {\em bpf\_program} to make sure that each
	{\tt dsc} process sees only the traffic it should see.

	Note that this directive must go before the {\em interface\/}
	directive because {\tt dsc\/} makes only one pass through
	the configuration file and the BPF filter is set when the
	interface is initialized.

	Example: {\tt bpf\_program "dst host 192.168.1.1";\/}

\item[interface]

	The interface name to sniff packets from or a pcap file to
	read packets from.   You may specify multiple interfaces.

	Example:
	{\tt interface fxp0;\/}
	{\tt interface /path/to/dump.pcap;\/}

\item[bpf\_vlan\_tag\_byte\_order]

	{\tt dsc\/} knows about VLAN tags.  Some operating systems
	(FreeBSD-4.x) have a bug whereby the VLAN tag id is
	byte-swapped.  Valid values for this directive are {\tt
	host\/} and {\tt net\/} (the default).    Set this to {\tt
	host\/} if you suspect your operating system has the VLAN
	tag byte order bug.

	Example: {\tt bpf\_vlan\_tag\_byte\_order host;\/}

\item[match\_vlan]

	A list of VLAN identifiers (integers).  If set, only the
	packets belonging to these VLANs are counted.

	Example: {\tt match\_vlan 101 102;\/}

\item[qname\_filter]

	This directive allows you to define custom filters
	to match query names in DNS messages.  Please see
	Section~\ref{sec-qname-filter} for more information.

\item[dataset]

	This directive is the heart of {\dsc}.  However, it is also
	the most complex.
	To save time we recommend that you copy interesting-looking
	dataset definitions from \path|dsc.conf.sample|.  Comment
	out any that you feel are irrelevant or uninteresting.
	Later, as you become more familiar with {\dsc}, you may
	want to read the next chapter and add your own custom
	datasets.

\item[output\_format]

	Specify the output format.  This directive can be given multiple times
	to output in more than one format.  The default output format is XML.

	Available formats are:
	\begin{itemize}
	\item XML
	\item JSON
	\end{itemize}

	Example: {\tt output\_format JSON;\/}
\end{description}


\section{A Complete Sample dsc.conf}

Here's how your entire {\em dsc.conf\/} file might look:

\begin{MyVerbatim}
#bpf_program
interface em0;

local_address 192.5.5.241;

run_dir "/usr/local/dsc/run/foo";

dataset qtype dns All:null Qtype:qtype queries-only;
dataset rcode dns All:null Rcode:rcode replies-only;
dataset opcode dns All:null Opcode:opcode queries-only;
dataset rcode_vs_replylen dns Rcode:rcode ReplyLen:msglen replies-only;
dataset client_subnet dns All:null ClientSubnet:client_subnet queries-only
        max-cells=200;
dataset qtype_vs_qnamelen dns Qtype:qtype QnameLen:qnamelen queries-only;
dataset qtype_vs_tld dns Qtype:qtype TLD:tld queries-only,popular-qtypes
        max-cells=200;
dataset certain_qnames_vs_qtype dns CertainQnames:certain_qnames
        Qtype:qtype queries-only;
dataset client_subnet2 dns Class:query_classification
        ClientSubnet:client_subnet queries-only max-cells=200;
dataset client_addr_vs_rcode dns Rcode:rcode ClientAddr:client
        replies-only max-cells=50;
dataset chaos_types_and_names dns Qtype:qtype Qname:qname
        chaos-class,queries-only;
dataset idn_qname dns All:null IDNQname:idn_qname queries-only;
dataset edns_version dns All:null EDNSVersion:edns_version queries-only;
dataset do_bit dns All:null D0:do_bit queries-only;
dataset rd_bit dns All:null RD:rd_bit queries-only;
dataset tc_bit dns All:null TC:tc_bit replies-only;
dataset idn_vs_tld dns All:null TLD:tld queries-only,idn-only;
dataset ipv6_rsn_abusers dns All:null ClientAddr:client
        queries-only,aaaa-or-a6-only,root-servers-net-only max-cells=50;
dataset transport_vs_qtype dns Transport:transport Qtype:qtype queries-only;

dataset direction_vs_ipproto ip Direction:ip_direction IPProto:ip_proto
        any;
\end{MyVerbatim}

\section{Running {\tt dsc}}

{\tt dsc\/} accepts a single command line argument, which is
the name of the configuration file.  For example:

\begin{MyVerbatim}
% cd /usr/local/dsc
% bin/dsc etc/foo.conf
\end{MyVerbatim}

If you run {\tt ps} when {\tt dsc} is running, you'll see two processes:

\begin{MyVerbatim}
60494  ??  S      0:00.36 bin/dsc etc/foo.conf
69453  ??  Ss     0:10.65 bin/dsc etc/foo.conf
\end{MyVerbatim}

The first process simply forks off child processes every
60 seconds.  The child processes do the work of analyzing
and tabulating DNS messages.

Please use NTP or another technique to keep the collector's
clock synchronized to the correct time.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\chapter{Viewing {\dsc} Graphs}

To view {\dsc} data in a web browser, simply enter the
URL to the \path|dsc-grapher.pl| CGI.   But before you
do that, you'll need to create a grapher configuration file.

\path|dsc-grapher.pl| uses a simple configuration file to set certain
menu options.  This  configuration file is
\path|/usr/local/dsc/etc/dsc-grapher.cfg|.  You should find
a sample version in the same directory.  For example:

\begin{MyVerbatim}
server f-root pao1 sfo2
server isc senna+piquet
server tmf hq sc lgh
trace_windows 1hour 4hour 1day 1week 1month
accum_windows 1day 2days 3days 1week
timezone Asia/Tokyo
domain_list isc_tlds br nl ca cz il pt cl
domain_list isc_tlds sk ph hr ae bg is si za
valid_domains isc isc_tlds

\end{MyVerbatim}

\begin{figure}
\centerline{\psfig{figure=screenshot1.eps,width=6.5in}}
\caption{\label{fig-screenshot1}A sample graph}
\end{figure}

Refer to Figure~\ref{fig-screenshot1} to see how
the directives affect the visual display.
The following three directives should always be set in
the configuration file:

\begin{description}
\item[server]
	This directive tells \path|dsc-grapher.pl| to list
	the given server and its associated nodes in the
	``Servers/Nodes'' section of its navigation menu.
	You can repeat this directive for each server that
	the Presenter has.
\item[trace\_windows]
	Specifies the ``Time Scale'' menu options for
	trace-based plots.
\item[accum\_windows]
	Specifies the ``Time Scale'' menu options for
	``cumulative'' plots, such as the Classification plot.
\end{description}

Note that the \path|dsc-grapher.cfg| only affects what
may appear in the navigation window.  It does NOT prevent users
from entering other values in the URL parameters.  For example,
if you have data for a server/node in your
\path|/usr/local/dsc/data/| directory that is not listed in
\path|dsc-grapher.cfg|, a user may still be able to view that
data by manually setting the URL query parameters.

The configuration file accepts a number of optional directives
as well.  You may set these if you like, but they are not
required:

\begin{description}
\item[timezone]
	Sets the time zone for dates and times displayed in the
	graphs.
	You can use this if you want to override the system
	time zone.
	The value for this directive should be the name
	of a timezone entry in your system database (usually found
	in {\path|/usr/share/zoneinfo|}).
	For example, if your system time zone is set
	to UTC but you want the times displayed for the
	London timezone, you can set this directive to
	{\tt Europe/London\/}.
\item[domain\_list]
	This directive, along with {\em valid\_domains\/}, tells the
	presenter which domains a nameserver is authoritative for.
	That information is used in the TLDs subgraphs to differentiate
	requests for ``valid'' and ``invalid'' domains.

	The {\em domain\_list\/} creates a named list of domains.
	The first token is a name for the list, and the remaining
	tokens are domain names.  The directive may be repeated with
	the same list name, as shown in the above example.
\item[valid\_domains]
	This directive glues servers and domain\_lists together.  The
	first token is the name of a {\em server\/} and the second token is
	the name of a {\em domain\_list\/}.
\item[embargo]
	The {\em embargo\/} directive may be used to delay the
	availability of data via the presenter.  For example, you
	may have one instance of {\em dsc-grapher.pl\/} for internal
	use only (password protected, etc).  You may also have a
	second instance for third-parties where data is delayed by
	some amount of time, such as hours, days, or weeks.  The value
	of the {\em embargo\/} directive is the number of seconds which
	data availability should be delayed.  For example, if you set
	it to 604800, then viewers will not be able to see any data
	less than one week old.
\item[anonymize\_ip]
	When the {\em anonymize\_ip\/} directive is given, IP addresses
	in the display will be anonymized.  The anonymization algorithm
	is currently hard-coded and designed only for IPv4 addresses.
	It masks off the lower 24 bits and leaves only the first octet
	in place.
\item[hide\_nodes]
	When the {\em hide\_nodes\/} directive is given, the presenter
	will not display the list of node names underneath the current
	server.  This might be useful if you have a number of nodes
	but only want viewers to see the server as a whole, without
	exposing the particular nodes in the cluster.  Note, however,
	that if someone already knows the name of a node they can
	hand-craft query terms in the URL to display the data for
	only that node.  In other words, the {\em hide\_nodes\/}
	directive only provides ``security through obscurity.''
\end{description}


The first few times you try \path|dsc-grapher.pl|, be sure to run
{\tt tail -f} on the HTTP server error.log file.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\chapter{{\dsc} Datasets}

A {\em dataset\/} is a 2-D array of counters.  For example, you
might have a dataset with ``Query Type'' along one dimension and
``Query Name Length'' on the other.  The result is a table that
shows the distribution of query name lengths for each query type.
For example:

\vspace{1ex}
\begin{center}
\begin{tabular}{l|rrrrrr}
Len & A & AAAA & A6 & PTR & NS & SOA \\
\hline
$\cdots$ & & & & & & \\
11 & 14 & 8 & 7 & 11 & 2 & 0 \\
12 & 19 & 2 & 3 & 19 & 4 & 1 \\
$\cdots$ & & & & & & \\
255 & 0 & 0 & 0 & 0 & 0 & 0 \\
\hline
\end{tabular}
\end{center}
\vspace{1ex}

\noindent
A dataset is defined by the following parameters:
\begin{itemize}
\setlength{\itemsep}{0ex plus 0.5ex minus 0.0ex}
\item A name
\item A protocol layer (IP or DNS)
\item An indexer for the first dimension
\item An indexer for the second dimension
\item One or more filters
\item Zero or more options and parameters
\end{itemize}

\noindent
The {\em dataset\/} definition syntax in \path|dsc.conf| is:

{\tt dataset\/}
{\em name\/}
{\em protocol\/}
{\em Label1:Indexer1\/}
{\em Label2:Indexer2\/}
{\em filter\/}
{\em [parameters]\/};
\vspace{2ex}

\section{Dataset Name}

The dataset name is used in the filename for {\tt dsc\/}'s XML
files.  Although this is an opaque string in theory, the Presenter's
XML extractor routines must recognize the dataset name to properly
parse it.  The source code file
\path|presenter/perllib/DSC/extractor/config.pm| contains an entry
for each known dataset name.

\section{Protocol}

{\dsc} currently knows about two protocol layers: IP and DNS.
On the {\tt dataset\/} line they are written as {\tt ip\/} and {\tt dns\/}.


\section{Indexers}

An {\em indexer\/} is simply a function that transforms the attributes
of an IP/DNS message into an array index.  For some attributes the
transformation is straightforward.  For example, the ``Query Type''
indexer simply extracts the query type value from a DNS message and
uses this 16-bit value as the array index.

Other attributes are slightly more complicated.  For example, the
``TLD'' indexer extracts the TLD of the QNAME field of a DNS message
and maps it to an integer.  The indexer maintains a simple internal
table of TLD-to-integer mappings.  The actual integer values are
unimportant because the TLD strings, not the integers, appear in
the resulting XML data.

When you specify an indexer on a {\tt dataset\/} line, you must
provide both the name of the indexer and a label.  The Label appears
as an attribute in the XML output.  For example,
Figure~\ref{fig-sample-xml} shows the XML corresponding to this
{\em dataset\/} line:

\begin{MyVerbatim}
dataset the_dataset dns Foo:foo Bar:bar queries-only;
\end{MyVerbatim}

\begin{figure}
\begin{MyVerbatim}
<array name="the_dataset" dimensions="2" start_time="1091663940" ...
  <dimension number="1" type="Foo"/>
  <dimension number="2" type="Bar"/>
  <data>
    <Foo val="1">
      <Bar val="0" count="4"/>
      ...
      <Bar val="100" count="41"/>
    </Foo>
    <Foo val="2">
      ...
    </Foo>
  </data>
</array>
\end{MyVerbatim}
\caption{\label{fig-sample-xml}Sample XML output}
\end{figure}

In theory you are free to choose any label that you like; however,
the XML extractors look for specific labels.  Please use the labels
given for the indexers in Tables~\ref{tbl-dns-indexers}
and~\ref{tbl-ip-indexers}.

\subsection{IP Indexers}

\begin{table}
\begin{center}
\begin{tabular}{|lll|}
\hline
Indexer & Label & Description \\
\hline
ip\_direction & Direction & one of sent, recv, or else \\
ip\_proto & IPProto & IP protocol (icmp, tcp, udp) \\
ip\_version & IPVersion & IP version number (4, 6) \\
\hline
\end{tabular}
\caption{\label{tbl-ip-indexers}IP packet indexers}
\end{center}
\end{table}

{\dsc} includes only minimal support for collecting IP-layer
stats.  Mostly we are interested in finding out the mix of
IP protocols received by the DNS server.  It can also show us
if/when the DNS server is the subject of a denial-of-service
attack.
Table~\ref{tbl-ip-indexers} shows the indexers for IP packets.
Here are their longer descriptions:

\begin{description}
\item[ip\_direction]
	One of three values: sent, recv, or else.  Direction is determined
	based on the setting for {\em local\_address\/} in the configuration file.
\item[ip\_proto]
	The IP protocol type, e.g.: tcp, udp, icmp.
	Note that the {\em bpf\_program\/} setting affects all traffic
	seen by {\dsc}.  If the program contains the word ``udp''
	then you won't see any counts for non-UDP traffic.
\item[ip\_version]
	The IP version number, e.g.: 4 or 6.  Can be used to compare how much
	traffic comes in via IPv6 compared to IPv4.
\end{description}

\subsection{IP Filters}

Currently there is only one IP protocol filter: {\tt any\/}.
It includes all received packets.


\subsection{DNS Indexers}

\begin{table}
\begin{center}
\begin{tabular}{|lll|}
\hline
Indexer & Label & Description \\
\hline
certain\_qnames & CertainQnames & Popular query names seen at roots \\
client\_subnet & ClientSubnet & The client's IP subnet (/24 for IPv4, /96 for IPv6) \\
client & ClientAddr & The client's IP address \\
do\_bit & DO & Whether the DO bit is on \\
edns\_version & EDNSVersion & The EDNS version number \\
idn\_qname & IDNQname & If the QNAME is in IDN format \\
msglen & MsgLen & The DNS message length \\
null & All & A ``no-op'' indexer \\
opcode & Opcode & DNS message opcode \\
qclass & - & Query class \\
qname & Qname & Full query name \\
qnamelen & QnameLen & Length of the query name \\
qtype & Qtype & DNS query type \\
query\_classification & Class & A classification for bogus queries \\
rcode & Rcode & DNS response code \\
rd\_bit & RD & Check if Recursion Desired bit set \\
tc\_bit & TC & Check if Truncated bit set \\
tld & TLD & TLD of the query name \\
transport & Transport & Transport protocol for the DNS message (UDP or TCP) \\
dns\_ip\_version & IPVersion & IP version of the packet carrying the DNS message \\
\hline
\end{tabular}
\caption{\label{tbl-dns-indexers}DNS message indexers}
\end{center}
\end{table}

Table~\ref{tbl-dns-indexers} shows the currently-defined indexers
for DNS messages, and here are their descriptions:

\begin{description}
\item[certain\_qnames]
	This indexer isolates the two most popular query names seen
	by DNS root servers: {\em localhost\/} and {\em
	[a--m].root-servers.net\/}.
\item[client\_subnet]
	Groups DNS messages together by the subnet of the
	client's IP address.  The subnet is masked by /24 for IPv4
	and by /96 for IPv6.  We use this to make datasets with
	large, diverse client populations more manageable and to
	provide a small amount of privacy and anonymization.
\item[client]
	The IP (v4 and v6) address of the DNS client.
\item[do\_bit]
	This indexer has only two values: 0 or 1.  It indicates
	whether or not the ``DO'' bit is set in a DNS query.  According to
	RFC 3225: {\em Setting the DO bit to one in a query indicates
	to the server that the resolver is able to accept DNSSEC
	security RRs.}
\item[edns\_version]
	The EDNS version number, if any, in a DNS query.  EDNS
	Version 0 is documented in RFC 2671.
\item[idn\_qname]
	This indexer has only two values: 0 or 1.  It returns 1
	when the first QNAME in the DNS message question section
	is an internationalized domain name (i.e., containing
	non-ASCII characters).  Such QNAMEs begin with the string
	{\tt xn--\/}.  This convention is documented in RFC 3490.
\item[msglen]
	The overall length (size) of the DNS message.
\item[null]
	A ``no-op'' indexer that always returns the same value.
	This can be used to effectively turn the 2-D table into a
	1-D array.
\item[opcode]
	The DNS message opcode is a four-bit field.  QUERY is the
	most common opcode.  Additional currently defined opcodes
	include: IQUERY, STATUS, NOTIFY, and UPDATE.
\item[qclass]
	The DNS message query class (QCLASS) is a 16-bit value.  IN
	is the most common query class.  Additional currently defined
	query class values include: CHAOS, HS, NONE, and ANY.
\item[qname]
	The full QNAME string from the first (and usually only)
	QNAME in the question section of a DNS message.
\item[qnamelen]
	The length of the first (and usually only) QNAME in a DNS
	message question section.  Note this is the ``expanded''
	length if the message happens to take advantage of DNS
	message ``compression.''
\item[qtype]
	The query type (QTYPE) for the first QNAME in the DNS message
	question section.  Well-known query types include: A, AAAA,
	A6, CNAME, PTR, MX, NS, SOA, and ANY.
\item[query\_classification]
	A stateless classification of ``bogus'' queries:
	\begin{itemize}
	\setlength{\itemsep}{0ex plus 0.5ex minus 0.0ex}
	\item non-auth-tld: when the TLD is not one of the IANA-approved TLDs.
	\item root-servers.net: a query for a root server IP address.
	\item localhost: a query for the localhost IP address.
	\item a-for-root: an A query for the DNS root (.).
	\item a-for-a: an A query for an IPv4 address.
	\item rfc1918-ptr: a PTR query for an RFC 1918 address.
	\item funny-qclass: a query with an unknown/undefined query class.
	\item funny-qtype: a query with an unknown/undefined query type.
	\item src-port-zero: when the UDP message's source port equals zero.
	\item malformed: a malformed DNS message that could not be entirely parsed.
	\end{itemize}
\item[rcode]
	The RCODE value in a DNS response.  The most common response
	codes are 0 (NO ERROR) and 3 (NXDOMAIN).
\item[rd\_bit]
	This indexer returns 1 if the RD (recursion desired) bit is
	set in the query.  Usually only stub resolvers set the RD bit.
	Usually authoritative servers do not offer recursion to their
	clients.
\item[tc\_bit]
	This indexer returns 1 if the TC (truncated) bit is
	set (in a response).  An authoritative server sets the TC bit
	when the entire response won't fit into a UDP message.
\item[tld]
	The TLD of the first QNAME in a DNS message's question section.
\item[transport]
	Indicates whether the DNS message is carried via UDP or TCP\@.
\item[dns\_ip\_version]
	The IP version number that carried the DNS message.
\end{description}

\subsection{DNS Filters}

You must specify one or more of the following filters (separated by commas) on
the {\tt dataset\/} line:

\begin{description}
\item[any]
	The no-op filter, counts all messages.
\item[queries-only]
	Count only DNS query messages.  A query is a DNS message
	where the QR bit is set to 0.
\item[replies-only]
	Count only DNS response messages.  A response is a DNS message
	where the QR bit is set to 1.
\item[popular-qtypes]
	Count only DNS messages where the query type is one of:
	A, NS, CNAME, SOA, PTR, MX, AAAA, A6, ANY.
\item[idn-only]
	Count only DNS messages where the query name is in the
	internationalized domain name format.
\item[aaaa-or-a6-only]
	Count only DNS messages where the query type is AAAA or A6.
\item[root-servers-net-only]
	Count only DNS messages where the query name is within
	the {\em root-servers.net\/} domain.
\item[chaos-class]
	Counts only DNS messages where QCLASS is equal to
	CHAOS (3).  The CHAOS class is generally used
	for only the special {\em hostname.bind\/} and
	{\em version.bind\/} queries.
\end{description}

\noindent
Note that multiple filters are ANDed together.  That is, they
narrow the input stream, rather than broaden it.

In addition to these pre-defined filters, you can add your own
custom filters.

\subsubsection{qname\_filter}
\label{sec-qname-filter}

The {\em qname\_filter} directive defines a new
filter that uses regular expression matching on the QNAME field of
a DNS message.  This may be useful if you have a server that is
authoritative for a number of zones, but you want to limit
your measurements to a small subset.  The {\em qname\_filter} directive
takes two arguments: a name for the filter and a regular expression.
For example:

\begin{MyVerbatim}
qname_filter MyFilterName example\.(com|net|org)$ ;
\end{MyVerbatim}

This filter matches queries (and responses) for names ending with
{\em example.com\/}, {\em example.net\/}, and {\em example.org\/}.
You can reference the named filter in the filters part of a {\em
dataset\/} line.  For example:

\begin{MyVerbatim}
dataset qtype dns All:null Qtype:qtype queries-only,MyFilterName;
\end{MyVerbatim}

\subsection{Parameters}
\label{sec-dataset-params}

\noindent
{\tt dsc\/} currently supports the following optional parameters:

\begin{description}
\item[min-count={\em NN\/}]
	Cells with counts less than {\em NN\/} are not included in
	the output.  Instead, they are aggregated into the special
	values {\tt -:SKIPPED:-\/} and {\tt -:SKIPPED\_SUM:-\/}.
	This helps reduce the size of datasets with a large number
	of small counts.
\item[max-cells={\em NN\/}]
	A different, perhaps better, way of limiting the size
	of a dataset.  Instead of trying to determine an appropriate
	{\em min-count\/} value in advance, {\em max-cells\/}
	allows you to put a limit on the number of cells to
	include for the second dataset dimension.  If the dataset
	has 9 possible first-dimension values, and you specify
	a {\em max-cells\/} count of 100, then the dataset will not
	have more than 900 total values.  The cell values are sorted
	and the top {\em max-cells\/} values are output.  Values
	that fall below the limit are aggregated into the special
	{\tt -:SKIPPED:-\/} and {\tt -:SKIPPED\_SUM:-\/} entries.
\end{description}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Data Storage}

\section{XML Structure}

A dataset XML file has the following structure:

\begin{MyVerbatim}
<array name="dataset-name" dimensions="2" start_time="unix-seconds"
        stop_time="unix-seconds">
  <dimension number="1" type="Label1"/>
  <dimension number="2" type="Label2"/>
  <data>
    <Label1 val="D1-V1">
      <Label2 val="D2-V1" count="N1"/>
      <Label2 val="D2-V2" count="N2"/>
      <Label2 val="D2-V3" count="N3"/>
    </Label1>
    <Label1 val="D1-V2">
      <Label2 val="D2-V1" count="N1"/>
      <Label2 val="D2-V2" count="N2"/>
      <Label2 val="D2-V3" count="N3"/>
    </Label1>
  </data>
</array>
\end{MyVerbatim}

\noindent
{\em dataset-name\/},
{\em Label1\/}, and
{\em Label2\/} come from the dataset definition in {\em dsc.conf\/}.

The {\em start\_time\/} and {\em stop\_time\/} attributes
are given in Unix seconds.  They are normally 60 seconds apart.
{\tt dsc} usually starts a new measurement interval on 60-second
boundaries. That is:

\begin{equation}
\mathit{stop\_time} \bmod 60 = 0
\end{equation}

The LABEL1 VAL attributes ({\em D1-V1\/}, {\em D1-V2\/}, etc) are
values for the first dimension indexer.
Similarly, the LABEL2 VAL attributes ({\em D2-V1\/}, {\em D2-V2\/},
{\em D2-V3\/}) are values for the second dimension indexer.
For some indexers these
values are numeric, for others they are strings.  If the value
contains certain non-printable characters, the string is base64-encoded
and the optional BASE64 attribute is set to 1.

There are two special VALs that help keep large datasets down
to a reasonable size: {\tt -:SKIPPED:-\/}  and {\tt -:SKIPPED\_SUM:-\/}.
These may be present on datasets that use the {\em min-count\/}
and {\em max-cells\/} parameters (see Section~\ref{sec-dataset-params}).
{\tt -:SKIPPED:-\/} is the number of cells that were not included
in the XML output.  {\tt -:SKIPPED\_SUM:-\/}, on the other hand, is the
sum of the counts for all the skipped cells.

Note that ``one-dimensional datasets'' still use two dimensions in
the XML file.  The first dimension type and value will be ``All'',
as shown in the example below.

The {\em count\/} values are always integers.  If the count for
a particular tuple is zero, it should not be included in the
XML file.

Note that the contents of the XML file do not indicate
where it came from.  In particular, the server and node that
it came from are not present.  Instead, DSC relies on the
presenter to store XML files in a directory hierarchy
with the server and node as directory names.


\noindent
Here is a short sample XML file with real content:
\begin{MyVerbatim}
<array name="rcode" dimensions="2" start_time="1154649600"
        stop_time="1154649660">
  <dimension number="1" type="All"/>
  <dimension number="2" type="Rcode"/>
  <data>
    <All val="ALL">
      <Rcode val="0" count="70945"/>
      <Rcode val="3" count="50586"/>
      <Rcode val="4" count="121"/>
      <Rcode val="1" count="56"/>
      <Rcode val="5" count="44"/>
    </All>
  </data>
</array>
\end{MyVerbatim}

\noindent
Please see
\path|http://dns.measurement-factory.com/tools/dsc/sample-xml/|
for more sample XML files.

The XML is not very strict and might cause XML purists to cringe.
{\tt dsc} writes the XML files the old-fashioned way (with printf())
and reads them with Perl's XML::Simple module.
Here is a possibly-valid DTD for the dataset XML format.
Note, however, that the {\em LABEL1\/}
and {\em LABEL2\/} strings are different
for each dataset:

\begin{MyVerbatim}
<!DOCTYPE ARRAY [

<!ELEMENT ARRAY (DIMENSION+, DATA)>
<!ELEMENT DIMENSION>
<!ELEMENT DATA (LABEL1+)>
<!ELEMENT LABEL1 (LABEL2+)>

<!ATTLIST ARRAY NAME CDATA #REQUIRED>
<!ATTLIST ARRAY DIMENSIONS CDATA #REQUIRED>
<!ATTLIST ARRAY START_TIME CDATA #REQUIRED>
<!ATTLIST ARRAY STOP_TIME CDATA #REQUIRED>
<!ATTLIST DIMENSION NUMBER CDATA #REQUIRED>
<!ATTLIST DIMENSION TYPE CDATA #REQUIRED>
<!ATTLIST LABEL1 VAL CDATA #REQUIRED>
<!ATTLIST LABEL2 VAL CDATA #REQUIRED>
<!ATTLIST LABEL2 COUNT CDATA #REQUIRED>

]>
\end{MyVerbatim}

\subsection{XML File Naming Conventions}

{\tt dsc\/} relies on certain file naming conventions for XML files.
The file name should be of the format:

\begin{quote}
{\em timestamp\/}.dscdata.xml
\end{quote}

\noindent
For example:

\begin{quote}
1154649660.dscdata.xml
\end{quote}

NOTE: Versions of DSC prior to 2008-01-30 used a different naming
convention.  Instead of ``dscdata'' the XML file was named after
the dataset that generated the data.  The current XML extraction
code still supports the older naming convention for backward compatibility.
If the second component of the XML file name is not ``dscdata'' then
the extractor assumes it is a dataset name.

\noindent
Dataset names come from {\em dsc.conf\/}, and should match the NAME
attribute of the ARRAY tag inside the XML file.  The timestamp is in
Unix epoch seconds and is usually the same as the {\em stop\_time\/}
value.


\section{JSON Structure}

The JSON structure mimics the XML structure so that elements are the same.

\begin{MyVerbatim}
{
  "name": "dataset-name",
  "start_time": unix-seconds,
  "stop_time": unix-seconds,
  "dimensions": [ "Label1", "Label2" ],
  "data": [
    {
      "Label1": "D1-V1",
      "Label2": [
        { "val": "D2-V1", "count": N1 },
        { "val": "D2-V2", "count": N2 },
        { "val": "D2-V3", "count": N3 }
      ]
    },
    {
      "Label1": "D1-V1-base64",
      "base64": true,
      "Label2": [
        { "val": "D2-V1", "count": N1 },
        { "val": "D2-V2-base64", "base64": true, "count": N2 },
        { "val": "D2-V3", "count": N3 }
      ]
    }
  ]
}
\end{MyVerbatim}


\section{Archived Data Format}

{\dsc} actually uses four different file formats for archived
datasets.  These are all text-based and designed to be quickly
read from, and written to, by Perl scripts.

\subsection{Format 1}

\noindent
\begin{tt}time $k1$ $N_{k1}$ $k2$ $N_{k2}$ $k3$ $N_{k3}$ ...
\end{tt}

\vspace{1ex}\noindent
This is a one-dimensional time-series format.\footnote{Which means
it can only be used for datasets where one of the indexers is set
to the Null indexer.}  The first column is a timestamp (unix seconds).
The remaining space-separated fields are key-value pairs.  For
example:

\begin{MyVerbatim}
1093219980 root-servers.net 122 rfc1918-ptr 112 a-for-a 926 funny-qclass 16
1093220040 root-servers.net 121 rfc1918-ptr 104 a-for-a 905 funny-qclass 15
1093220100 root-servers.net 137 rfc1918-ptr 116 a-for-a 871 funny-qclass 12
\end{MyVerbatim}

\subsection{Format 2}

\noindent
\begin{tt}time $j1$ $k1$:$N_{j1,k1}$:$k2$:$N_{j1,k2}$:... $j2$ $k1$:$N_{j2,k1}$:$k2$:$N_{j2,k2}$:... ...
\end{tt}

\vspace{1ex}\noindent
This is a two-dimensional time-series format.  In the above,
$j$ represents the first dimension indexer and $k$ represents
the second.  Key-value pairs for the second dimension are
separated by colons, rather than spaces.  For example:

\begin{MyVerbatim}
1093220160 recv icmp:2397:udp:136712:tcp:428 sent icmp:819:udp:119191:tcp:323
1093220220 recv icmp:2229:udp:124708:tcp:495 sent icmp:716:udp:107652:tcp:350
1093220280 recv udp:138212:icmp:2342:tcp:499 sent udp:120788:icmp:819:tcp:364
1093220340 recv icmp:2285:udp:137107:tcp:468 sent icmp:733:udp:118522:tcp:341
\end{MyVerbatim}

\subsection{Format 3}

\noindent
\begin{tt}$k$ $N_{k}$
\end{tt}

\vspace{1ex}\noindent
This format is used for one-dimensional datasets where the key space
is (potentially) very large.  That is, putting all the key-value pairs
on a single line would result in a very long line in the datafile.
Furthermore, for these larger datasets, it is prohibitive to
store the data as a time series.  Instead the counters are incremented
over time.  For example:

\begin{MyVerbatim}
10.0.160.0 3024
10.0.20.0 92
10.0.244.0 5934
\end{MyVerbatim}

\subsection{Format 4}

\noindent
\begin{tt}$j$ $k$ $N_{j,k}$
\end{tt}

\vspace{1ex}\noindent
This format is used for two-dimensional datasets where one or both
key spaces are very large.  Again, counters are incremented over
time, rather than storing the data as a time series.
For example:

\begin{MyVerbatim}
10.0.0.0 non-auth-tld 105
10.0.0.0 ok 37383
10.0.0.0 rfc1918-ptr 5941
10.0.0.0 root-servers.net 1872
10.0.1.0 a-for-a 6
10.0.1.0 non-auth-tld 363
10.0.1.0 ok 144
\end{MyVerbatim}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Bugs}

\begin{itemize}

\item
	Seems too confusing to have an opaque name for indexers in
	dsc.conf dataset line.  The names are pre-determined anyway
	since they must match what the XML extractors look for.
\item
	Also stupid to have indexer names and a separate ``Label'' for
	the XML file.

\item
	{\dsc} perl modules are installed in the ``site\_perl'' directory
	but they should probably be installed under /usr/local/dsc.

\item
	{\dsc} collector silently drops UDP frags

\end{itemize}

\end{document}