1# Copyright (c) 2003-2004 The Regents of The University of Michigan
2# All rights reserved.
3#
4# Redistribution and use in source and binary forms, with or without
5# modification, are permitted provided that the following conditions are
6# met: redistributions of source code must retain the above copyright
7# notice, this list of conditions and the following disclaimer;
8# redistributions in binary form must reproduce the above copyright
9# notice, this list of conditions and the following disclaimer in the
10# documentation and/or other materials provided with the distribution;
11# neither the name of the copyright holders nor the names of its
12# contributors may be used to endorse or promote products derived from
13# this software without specific prior written permission.
14#
15# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26#
27# Authors: Nathan Binkert
28
29import MySQLdb
30
31class MyDB(object):
32    def __init__(self, options):
33        self.name = options.db
34        self.host = options.host
35        self.user = options.user
36        self.passwd = options.passwd
37        self.mydb = None
38        self.cursor = None
39
40    def admin(self):
41        self.close()
42        self.mydb = MySQLdb.connect(db='mysql', host=self.host, user=self.user,
43                                    passwd=self.passwd)
44        self.cursor = self.mydb.cursor()
45
46    def connect(self):
47        self.close()
48        self.mydb = MySQLdb.connect(db=self.name, host=self.host,
49                                    user=self.user, passwd=self.passwd)
50        self.cursor = self.mydb.cursor()
51
52    def close(self):
53        if self.mydb is not None:
54            self.mydb.close()
55        self.cursor = None
56
57    def query(self, sql):
58        self.cursor.execute(sql)
59
60    def drop(self):
61        self.query('DROP DATABASE IF EXISTS %s' % self.name)
62
63    def create(self):
64        self.query('CREATE DATABASE %s' % self.name)
65
66    def populate(self):
67        #
68        # Each run (or simulation) gets its own entry in the runs table to
69        # group stats by where they were generated
70        #
71        # COLUMNS:
72        #   'id' is a unique identifier for each run to be used in other
73        #       tables.
74        #   'name' is the user designated name for the data generated.  It is
75        #       configured in the simulator.
76        #   'user' identifies the user that generated the data for the given
77        #       run.
78        #   'project' another name to identify runs for a specific goal
79        #   'date' is a timestamp for when the data was generated.  It can be
80        #       used to easily expire data that was generated in the past.
81        #   'expire' is a timestamp for when the data should be removed from
82        #       the database so we don't have years worth of junk.
83        #
84        # INDEXES:
85        #   'run' is indexed so you can find out details of a run if the run
86        #       was retreived from the data table.
87        #   'name' is indexed so that two all run names are forced to be unique
88        #
89        self.query('''
90        CREATE TABLE runs(
91            rn_id	SMALLINT UNSIGNED	NOT NULL AUTO_INCREMENT,
92            rn_name	VARCHAR(200)		NOT NULL,
93            rn_sample	VARCHAR(32)		NOT NULL,
94            rn_user	VARCHAR(32)		NOT NULL,
95            rn_project	VARCHAR(100)            NOT NULL,
96            rn_date	TIMESTAMP		NOT NULL,
97            rn_expire	TIMESTAMP               NOT NULL,
98            PRIMARY KEY (rn_id),
99            UNIQUE (rn_name,rn_sample)
100        ) TYPE=InnoDB''')
101
102        #
103        # The stat table gives us all of the data for a particular stat.
104        #
105        # COLUMNS:
106        #   'stat' is a unique identifier for each stat to be used in other
107        #       tables for references.
108        #   'name' is simply the simulator derived name for a given
109        #       statistic.
110        #   'descr' is the description of the statistic and what it tells
111        #       you.
112        #   'type' defines what the stat tells you.  Types are:
113        #       SCALAR: A simple scalar statistic that holds one value
114        #       VECTOR: An array of statistic values.  Such a something that
115        #           is generated per-thread.  Vectors exist to give averages,
116        #	     pdfs, cdfs, means, standard deviations, etc across the
117        #           stat values.
118        #       DIST: Is a distribution of data.  When the statistic value is
119        #	     sampled, its value is counted in a particular bucket.
120        #           Useful for keeping track of utilization of a resource.
121        #           (e.g. fraction of time it is 25% used vs. 50% vs. 100%)
122        #       VECTORDIST: Can be used when the distribution needs to be
123        #	     factored out into a per-thread distribution of data for
124        #	     example.  It can still be summed across threads to find
125        #           the total distribution.
126        #       VECTOR2D: Can be used when you have a stat that is not only
127        #           per-thread, but it is per-something else.  Like
128        #           per-message type.
129        #       FORMULA: This statistic is a formula, and its data must be
130        #	     looked up in the formula table, for indicating how to
131        #           present its values.
132        #   'subdata' is potentially used by any of the vector types to
133        #       give a specific name to all of the data elements within a
134        #       stat.
135        #   'print' indicates whether this stat should be printed ever.
136        #       (Unnamed stats don't usually get printed)
137        #   'prereq' only print the stat if the prereq is not zero.
138        #   'prec' number of decimal places to print
139        #   'nozero' don't print zero values
140        #   'nonan' don't print NaN values
141        #   'total' for vector type stats, print the total.
142        #   'pdf' for vector type stats, print the pdf.
143        #   'cdf' for vector type stats, print the cdf.
144        #
145        #   The Following are for dist type stats:
146        #   'min' is the minimum bucket value. Anything less is an underflow.
147        #   'max' is the maximum bucket value. Anything more is an overflow.
148        #   'bktsize' is the approximate number of entries in each bucket.
149        #   'size' is the number of buckets. equal to (min/max)/bktsize.
150        #
151        # INDEXES:
152        #   'stat' is indexed so that you can find out details about a stat
153        #       if the stat id was retrieved from the data table.
154        #   'name' is indexed so that you can simply look up data about a
155        #       named stat.
156        #
157        self.query('''
158        CREATE TABLE stats(
159            st_id	SMALLINT UNSIGNED	NOT NULL AUTO_INCREMENT,
160            st_name	VARCHAR(255)		NOT NULL,
161            st_descr	TEXT			NOT NULL,
162            st_type	ENUM("SCALAR", "VECTOR", "DIST", "VECTORDIST",
163                "VECTOR2D", "FORMULA")	NOT NULL,
164            st_print	BOOL			NOT NULL,
165            st_prereq	SMALLINT UNSIGNED	NOT NULL,
166            st_prec	TINYINT			NOT NULL,
167            st_nozero	BOOL			NOT NULL,
168            st_nonan	BOOL			NOT NULL,
169            st_total	BOOL			NOT NULL,
170            st_pdf	BOOL			NOT NULL,
171            st_cdf	BOOL			NOT NULL,
172            st_min	DOUBLE			NOT NULL,
173            st_max	DOUBLE			NOT NULL,
174            st_bktsize	DOUBLE			NOT NULL,
175            st_size	SMALLINT UNSIGNED	NOT NULL,
176            PRIMARY KEY (st_id),
177            UNIQUE (st_name)
178        ) TYPE=InnoDB''')
179
180        #
181        # This is the main table of data for stats.
182        #
183        # COLUMNS:
184        #   'stat' refers to the stat field given in the stat table.
185        #
186        #   'x' referrs to the first dimension of a multi-dimensional stat. For
187        #       a vector, x will start at 0 and increase for each vector
188        #       element.
189        #       For a distribution:
190        #       -1: sum (for calculating standard deviation)
191        #       -2: sum of squares (for calculating standard deviation)
192        #       -3: total number of samples taken (for calculating
193        #           standard deviation)
194        #       -4: minimum value
195        #       -5: maximum value
196        #       -6: underflow
197        #       -7: overflow
198        #   'y' is used by a VECTORDIST and the VECTOR2D to describe the second
199        #       dimension.
200        #   'run' is the run that the data was generated from.  Details up in
201        #       the run table
202        #   'tick' is a timestamp generated by the simulator.
203        #   'data' is the actual stat value.
204        #
205        # INDEXES:
206        #   'stat' is indexed so that a user can find all of the data for a
207        #       particular stat. It is not unique, because that specific stat
208        #       can be found in many runs and samples, in addition to
209        #       having entries for the mulidimensional cases.
210        #   'run' is indexed to allow a user to remove all of the data for a
211        #       particular execution run.  It can also be used to allow the
212        #       user to print out all of the data for a given run.
213        #
214        self.query('''
215        CREATE TABLE data(
216            dt_stat	SMALLINT UNSIGNED	NOT NULL,
217            dt_x	SMALLINT		NOT NULL,
218            dt_y	SMALLINT		NOT NULL,
219            dt_run	SMALLINT UNSIGNED	NOT NULL,
220            dt_tick	BIGINT UNSIGNED		NOT NULL,
221            dt_data	DOUBLE			NOT NULL,
222            INDEX (dt_stat),
223            INDEX (dt_run),
224            UNIQUE (dt_stat,dt_x,dt_y,dt_run,dt_tick)
225        ) TYPE=InnoDB;''')
226
227        #
228        # Names and descriptions for multi-dimensional stats (vectors, etc.)
229        # are stored here instead of having their own entry in the statistics
230        # table. This allows all parts of a single stat to easily share a
231        # single id.
232        #
233        # COLUMNS:
234        #   'stat' is the unique stat identifier from the stat table.
235        #   'x' is the first dimension for multi-dimensional stats
236        #       corresponding to the data table above.
237        #   'y' is the second dimension for multi-dimensional stats
238        #       corresponding to the data table above.
239        #   'name' is the specific subname for the unique stat,x,y combination.
240        #   'descr' is the specific description for the uniqe stat,x,y
241        #        combination.
242        #
243        # INDEXES:
244        #   'stat' is indexed so you can get the subdata for a specific stat.
245        #
246        self.query('''
247        CREATE TABLE subdata(
248            sd_stat	SMALLINT UNSIGNED	NOT NULL,
249            sd_x	SMALLINT		NOT NULL,
250            sd_y	SMALLINT		NOT NULL,
251            sd_name	VARCHAR(255)		NOT NULL,
252            sd_descr	TEXT,
253            UNIQUE (sd_stat,sd_x,sd_y)
254        ) TYPE=InnoDB''')
255
256
257        #
258        # The formula table is maintained separately from the data table
259        # because formula data, unlike other stat data cannot be represented
260        # there.
261        #
262        # COLUMNS:
263        #   'stat' refers to the stat field generated in the stat table.
264        #   'formula' is the actual string representation of the formula
265        #       itself.
266        #
267        # INDEXES:
268        #   'stat' is indexed so that you can just look up a formula.
269        #
270        self.query('''
271        CREATE TABLE formulas(
272            fm_stat	SMALLINT UNSIGNED	NOT NULL,
273            fm_formula	BLOB			NOT NULL,
274            PRIMARY KEY(fm_stat)
275        ) TYPE=InnoDB''')
276
277        #
278        # Each stat used in each formula is kept in this table.  This way, if
279        # you want to print out a particular formula, you can simply find out
280        # which stats you need by looking in this table.  Additionally, when
281        # you remove a stat from the stats table and data table, you remove
282        # any references to the formula in this table.  When a formula is no
283        # longer referred to, you remove its entry.
284        #
285        # COLUMNS:
286        #   'stat' is the stat id from the stat table above.
287        #   'child' is the stat id of a stat that is used for this formula.
288        #       There may be many children for any given 'stat' (formula)
289        #
290        # INDEXES:
291        #   'stat' is indexed so you can look up all of the children for a
292        #       particular stat.
293        #   'child' is indexed so that you can remove an entry when a stat is
294        #       removed.
295        #
296        self.query('''
297        CREATE TABLE formula_ref(
298            fr_stat	SMALLINT UNSIGNED	NOT NULL,
299            fr_run	SMALLINT UNSIGNED	NOT NULL,
300            UNIQUE (fr_stat,fr_run),
301            INDEX (fr_stat),
302            INDEX (fr_run)
303        ) TYPE=InnoDB''')
304
305        # COLUMNS:
306        #   'event' is the unique event id from the event_desc table
307        #   'run' is simulation run id that this event took place in
308        #   'tick' is the tick when the event happened
309        #
310        # INDEXES:
311        #   'event' is indexed so you can look up all occurences of a
312        #       specific event
313        #   'run' is indexed so you can find all events in a run
314        #   'tick' is indexed because we want the unique thing anyway
315        #   'event,run,tick' is unique combination
316        self.query('''
317        CREATE TABLE events(
318            ev_event	SMALLINT UNSIGNED	NOT NULL,
319            ev_run	SMALLINT UNSIGNED	NOT NULL,
320            ev_tick	BIGINT   UNSIGNED       NOT NULL,
321            INDEX(ev_event),
322            INDEX(ev_run),
323            INDEX(ev_tick),
324            UNIQUE(ev_event,ev_run,ev_tick)
325        ) TYPE=InnoDB''')
326
327        # COLUMNS:
328        #   'id' is the unique description id
329        #   'name' is the name of the event that occurred
330        #
331        # INDEXES:
332        #   'id' is indexed because it is the primary key and is what you use
333        #       to look up the descriptions
334        #   'name' is indexed so one can find the event based on name
335        #
336        self.query('''
337        CREATE TABLE event_names(
338            en_id	SMALLINT UNSIGNED	NOT NULL AUTO_INCREMENT,
339            en_name	VARCHAR(255)		NOT NULL,
340            PRIMARY KEY (en_id),
341            UNIQUE (en_name)
342        ) TYPE=InnoDB''')
343
344    def clean(self):
345        self.query('''
346        DELETE data
347        FROM data
348        LEFT JOIN runs ON dt_run=rn_id
349        WHERE rn_id IS NULL''')
350
351        self.query('''
352        DELETE formula_ref
353        FROM formula_ref
354        LEFT JOIN runs ON fr_run=rn_id
355        WHERE rn_id IS NULL''')
356
357        self.query('''
358        DELETE formulas
359        FROM formulas
360        LEFT JOIN formula_ref ON fm_stat=fr_stat
361        WHERE fr_stat IS NULL''')
362
363        self.query('''
364        DELETE stats
365        FROM stats
366        LEFT JOIN data ON st_id=dt_stat
367        WHERE dt_stat IS NULL''')
368
369        self.query('''
370        DELETE subdata
371        FROM subdata
372        LEFT JOIN data ON sd_stat=dt_stat
373        WHERE dt_stat IS NULL''')
374
375        self.query('''
376        DELETE events
377        FROM events
378        LEFT JOIN runs ON ev_run=rn_id
379        WHERE rn_id IS NULL''')
380
381        self.query('''
382        DELETE event_names
383        FROM event_names
384        LEFT JOIN events ON en_id=ev_event
385        WHERE ev_event IS NULL''')
386