dbinit.py revision 1772:a3a83e812a5e
1# Copyright (c) 2003-2004 The Regents of The University of Michigan
2# All rights reserved.
3#
4# Redistribution and use in source and binary forms, with or without
5# modification, are permitted provided that the following conditions are
6# met: redistributions of source code must retain the above copyright
7# notice, this list of conditions and the following disclaimer;
8# redistributions in binary form must reproduce the above copyright
9# notice, this list of conditions and the following disclaimer in the
10# documentation and/or other materials provided with the distribution;
11# neither the name of the copyright holders nor the names of its
12# contributors may be used to endorse or promote products derived from
13# this software without specific prior written permission.
14#
15# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27import MySQLdb
28
29class MyDB(object):
30    def __init__(self, options):
31        self.name = options.db
32        self.host = options.host
33        self.user = options.user
34        self.passwd = options.passwd
35        self.mydb = None
36        self.cursor = None
37
38    def admin(self):
39        self.close()
40        self.mydb = MySQLdb.connect(db='mysql', host=self.host, user=self.user,
41                                    passwd=self.passwd)
42        self.cursor = self.mydb.cursor()
43
44    def connect(self):
45        self.close()
46        self.mydb = MySQLdb.connect(db=self.name, host=self.host,
47                                    user=self.user, passwd=self.passwd)
48        self.cursor = self.mydb.cursor()
49
50    def close(self):
51        if self.mydb is not None:
52            self.mydb.close()
53        self.cursor = None
54
55    def query(self, sql):
56        self.cursor.execute(sql)
57
58    def drop(self):
59        self.query('DROP DATABASE IF EXISTS %s' % self.name)
60
61    def create(self):
62        self.query('CREATE DATABASE %s' % self.name)
63
64    def populate(self):
65        #
66        # Each run (or simulation) gets its own entry in the runs table to
67        # group stats by where they were generated
68        #
69        # COLUMNS:
70        #   'id' is a unique identifier for each run to be used in other
71        #       tables.
72        #   'name' is the user designated name for the data generated.  It is
73        #       configured in the simulator.
74        #   'user' identifies the user that generated the data for the given
75        #       run.
76        #   'project' another name to identify runs for a specific goal
77        #   'date' is a timestamp for when the data was generated.  It can be
78        #       used to easily expire data that was generated in the past.
79        #   'expire' is a timestamp for when the data should be removed from
80        #       the database so we don't have years worth of junk.
81        #
82        # INDEXES:
83        #   'run' is indexed so you can find out details of a run if the run
84        #       was retreived from the data table.
85        #   'name' is indexed so that two all run names are forced to be unique
86        #
87        self.query('''
88        CREATE TABLE runs(
89            rn_id	SMALLINT UNSIGNED	NOT NULL AUTO_INCREMENT,
90            rn_name	VARCHAR(200)		NOT NULL,
91            rn_sample	VARCHAR(32)		NOT NULL,
92            rn_user	VARCHAR(32)		NOT NULL,
93            rn_project	VARCHAR(100)            NOT NULL,
94            rn_date	TIMESTAMP		NOT NULL,
95            rn_expire	TIMESTAMP               NOT NULL,
96            PRIMARY KEY (rn_id),
97            UNIQUE (rn_name,rn_sample)
98        ) TYPE=InnoDB''')
99
100        #
101        # We keep the bin names separate so that the data table doesn't get
102        # huge since bin names are frequently repeated.
103        #
104        # COLUMNS:
105        #   'id' is the unique bin identifer.
106        #   'name' is the string name for the bin.
107        #
108        # INDEXES:
109        #   'bin' is indexed to get the name of a bin when data is retrieved
110        #       via the data table.
111        #   'name' is indexed to get the bin id for a named bin when you want
112        #       to search the data table based on a specific bin.
113        #
114        self.query('''
115        CREATE TABLE bins(
116            bn_id	SMALLINT UNSIGNED	NOT NULL AUTO_INCREMENT,
117            bn_name	VARCHAR(255)		NOT NULL,
118            PRIMARY KEY(bn_id),
119            UNIQUE (bn_name)
120        ) TYPE=InnoDB''')
121
122        #
123        # The stat table gives us all of the data for a particular stat.
124        #
125        # COLUMNS:
126        #   'stat' is a unique identifier for each stat to be used in other
127        #       tables for references.
128        #   'name' is simply the simulator derived name for a given
129        #       statistic.
130        #   'descr' is the description of the statistic and what it tells
131        #       you.
132        #   'type' defines what the stat tells you.  Types are:
133        #       SCALAR: A simple scalar statistic that holds one value
134        #       VECTOR: An array of statistic values.  Such a something that
135        #           is generated per-thread.  Vectors exist to give averages,
136        #	     pdfs, cdfs, means, standard deviations, etc across the
137        #           stat values.
138        #       DIST: Is a distribution of data.  When the statistic value is
139        #	     sampled, its value is counted in a particular bucket.
140        #           Useful for keeping track of utilization of a resource.
141        #           (e.g. fraction of time it is 25% used vs. 50% vs. 100%)
142        #       VECTORDIST: Can be used when the distribution needs to be
143        #	     factored out into a per-thread distribution of data for
144        #	     example.  It can still be summed across threads to find
145        #           the total distribution.
146        #       VECTOR2D: Can be used when you have a stat that is not only
147        #           per-thread, but it is per-something else.  Like
148        #           per-message type.
149        #       FORMULA: This statistic is a formula, and its data must be
150        #	     looked up in the formula table, for indicating how to
151        #           present its values.
152        #   'subdata' is potentially used by any of the vector types to
153        #       give a specific name to all of the data elements within a
154        #       stat.
155        #   'print' indicates whether this stat should be printed ever.
156        #       (Unnamed stats don't usually get printed)
157        #   'prereq' only print the stat if the prereq is not zero.
158        #   'prec' number of decimal places to print
159        #   'nozero' don't print zero values
160        #   'nonan' don't print NaN values
161        #   'total' for vector type stats, print the total.
162        #   'pdf' for vector type stats, print the pdf.
163        #   'cdf' for vector type stats, print the cdf.
164        #
165        #   The Following are for dist type stats:
166        #   'min' is the minimum bucket value. Anything less is an underflow.
167        #   'max' is the maximum bucket value. Anything more is an overflow.
168        #   'bktsize' is the approximate number of entries in each bucket.
169        #   'size' is the number of buckets. equal to (min/max)/bktsize.
170        #
171        # INDEXES:
172        #   'stat' is indexed so that you can find out details about a stat
173        #       if the stat id was retrieved from the data table.
174        #   'name' is indexed so that you can simply look up data about a
175        #       named stat.
176        #
177        self.query('''
178        CREATE TABLE stats(
179            st_id	SMALLINT UNSIGNED	NOT NULL AUTO_INCREMENT,
180            st_name	VARCHAR(255)		NOT NULL,
181            st_descr	TEXT			NOT NULL,
182            st_type	ENUM("SCALAR", "VECTOR", "DIST", "VECTORDIST",
183                "VECTOR2D", "FORMULA")	NOT NULL,
184            st_print	BOOL			NOT NULL,
185            st_prereq	SMALLINT UNSIGNED	NOT NULL,
186            st_prec	TINYINT			NOT NULL,
187            st_nozero	BOOL			NOT NULL,
188            st_nonan	BOOL			NOT NULL,
189            st_total	BOOL			NOT NULL,
190            st_pdf	BOOL			NOT NULL,
191            st_cdf	BOOL			NOT NULL,
192            st_min	DOUBLE			NOT NULL,
193            st_max	DOUBLE			NOT NULL,
194            st_bktsize	DOUBLE			NOT NULL,
195            st_size	SMALLINT UNSIGNED	NOT NULL,
196            PRIMARY KEY (st_id),
197            UNIQUE (st_name)
198        ) TYPE=InnoDB''')
199
200        #
201        # This is the main table of data for stats.
202        #
203        # COLUMNS:
204        #   'stat' refers to the stat field given in the stat table.
205        #
206        #   'x' referrs to the first dimension of a multi-dimensional stat. For
207        #       a vector, x will start at 0 and increase for each vector
208        #       element.
209        #       For a distribution:
210        #       -1: sum (for calculating standard deviation)
211        #       -2: sum of squares (for calculating standard deviation)
212        #       -3: total number of samples taken (for calculating
213        #           standard deviation)
214        #       -4: minimum value
215        #       -5: maximum value
216        #       -6: underflow
217        #       -7: overflow
218        #   'y' is used by a VECTORDIST and the VECTOR2D to describe the second
219        #       dimension.
220        #   'run' is the run that the data was generated from.  Details up in
221        #       the run table
222        #   'tick' is a timestamp generated by the simulator.
223        #   'bin' is the name of the bin that the data was generated in, if
224        #       any.
225        #   'data' is the actual stat value.
226        #
227        # INDEXES:
228        #   'stat' is indexed so that a user can find all of the data for a
229        #       particular stat. It is not unique, because that specific stat
230        #       can be found in many runs, bins, and samples, in addition to
231        #       having entries for the mulidimensional cases.
232        #   'run' is indexed to allow a user to remove all of the data for a
233        #       particular execution run.  It can also be used to allow the
234        #       user to print out all of the data for a given run.
235        #
236        self.query('''
237        CREATE TABLE data(
238            dt_stat	SMALLINT UNSIGNED	NOT NULL,
239            dt_x	SMALLINT		NOT NULL,
240            dt_y	SMALLINT		NOT NULL,
241            dt_run	SMALLINT UNSIGNED	NOT NULL,
242            dt_tick	BIGINT UNSIGNED		NOT NULL,
243            dt_bin	SMALLINT UNSIGNED	NOT NULL,
244            dt_data	DOUBLE			NOT NULL,
245            INDEX (dt_stat),
246            INDEX (dt_run),
247            UNIQUE (dt_stat,dt_x,dt_y,dt_run,dt_tick,dt_bin)
248        ) TYPE=InnoDB;''')
249
250        #
251        # Names and descriptions for multi-dimensional stats (vectors, etc.)
252        # are stored here instead of having their own entry in the statistics
253        # table. This allows all parts of a single stat to easily share a
254        # single id.
255        #
256        # COLUMNS:
257        #   'stat' is the unique stat identifier from the stat table.
258        #   'x' is the first dimension for multi-dimensional stats
259        #       corresponding to the data table above.
260        #   'y' is the second dimension for multi-dimensional stats
261        #       corresponding to the data table above.
262        #   'name' is the specific subname for the unique stat,x,y combination.
263        #   'descr' is the specific description for the uniqe stat,x,y
264        #        combination.
265        #
266        # INDEXES:
267        #   'stat' is indexed so you can get the subdata for a specific stat.
268        #
269        self.query('''
270        CREATE TABLE subdata(
271            sd_stat	SMALLINT UNSIGNED	NOT NULL,
272            sd_x	SMALLINT		NOT NULL,
273            sd_y	SMALLINT		NOT NULL,
274            sd_name	VARCHAR(255)		NOT NULL,
275            sd_descr	TEXT,
276            UNIQUE (sd_stat,sd_x,sd_y)
277        ) TYPE=InnoDB''')
278
279
280        #
281        # The formula table is maintained separately from the data table
282        # because formula data, unlike other stat data cannot be represented
283        # there.
284        #
285        # COLUMNS:
286        #   'stat' refers to the stat field generated in the stat table.
287        #   'formula' is the actual string representation of the formula
288        #       itself.
289        #
290        # INDEXES:
291        #   'stat' is indexed so that you can just look up a formula.
292        #
293        self.query('''
294        CREATE TABLE formulas(
295            fm_stat	SMALLINT UNSIGNED	NOT NULL,
296            fm_formula	BLOB			NOT NULL,
297            PRIMARY KEY(fm_stat)
298        ) TYPE=InnoDB''')
299
300        #
301        # Each stat used in each formula is kept in this table.  This way, if
302        # you want to print out a particular formula, you can simply find out
303        # which stats you need by looking in this table.  Additionally, when
304        # you remove a stat from the stats table and data table, you remove
305        # any references to the formula in this table.  When a formula is no
306        # longer referred to, you remove its entry.
307        #
308        # COLUMNS:
309        #   'stat' is the stat id from the stat table above.
310        #   'child' is the stat id of a stat that is used for this formula.
311        #       There may be many children for any given 'stat' (formula)
312        #
313        # INDEXES:
314        #   'stat' is indexed so you can look up all of the children for a
315        #       particular stat.
316        #   'child' is indexed so that you can remove an entry when a stat is
317        #       removed.
318        #
319        self.query('''
320        CREATE TABLE formula_ref(
321            fr_stat	SMALLINT UNSIGNED	NOT NULL,
322            fr_run	SMALLINT UNSIGNED	NOT NULL,
323            UNIQUE (fr_stat,fr_run),
324            INDEX (fr_stat),
325            INDEX (fr_run)
326        ) TYPE=InnoDB''')
327
328        # COLUMNS:
329        #   'event' is the unique event id from the event_desc table
330        #   'run' is simulation run id that this event took place in
331        #   'tick' is the tick when the event happened
332        #
333        # INDEXES:
334        #   'event' is indexed so you can look up all occurences of a
335        #       specific event
336        #   'run' is indexed so you can find all events in a run
337        #   'tick' is indexed because we want the unique thing anyway
338        #   'event,run,tick' is unique combination
339        self.query('''
340        CREATE TABLE events(
341            ev_event	SMALLINT UNSIGNED	NOT NULL,
342            ev_run	SMALLINT UNSIGNED	NOT NULL,
343            ev_tick	BIGINT   UNSIGNED       NOT NULL,
344            INDEX(ev_event),
345            INDEX(ev_run),
346            INDEX(ev_tick),
347            UNIQUE(ev_event,ev_run,ev_tick)
348        ) TYPE=InnoDB''')
349
350        # COLUMNS:
351        #   'id' is the unique description id
352        #   'name' is the name of the event that occurred
353        #
354        # INDEXES:
355        #   'id' is indexed because it is the primary key and is what you use
356        #       to look up the descriptions
357        #   'name' is indexed so one can find the event based on name
358        #
359        self.query('''
360        CREATE TABLE event_names(
361            en_id	SMALLINT UNSIGNED	NOT NULL AUTO_INCREMENT,
362            en_name	VARCHAR(255)		NOT NULL,
363            PRIMARY KEY (en_id),
364            UNIQUE (en_name)
365        ) TYPE=InnoDB''')
366
367    def clean(self):
368        self.query('''
369        DELETE data
370        FROM data
371        LEFT JOIN runs ON dt_run=rn_id
372        WHERE rn_id IS NULL''')
373
374        self.query('''
375        DELETE formula_ref
376        FROM formula_ref
377        LEFT JOIN runs ON fr_run=rn_id
378        WHERE rn_id IS NULL''')
379
380        self.query('''
381        DELETE formulas
382        FROM formulas
383        LEFT JOIN formula_ref ON fm_stat=fr_stat
384        WHERE fr_stat IS NULL''')
385
386        self.query('''
387        DELETE stats
388        FROM stats
389        LEFT JOIN data ON st_id=dt_stat
390        WHERE dt_stat IS NULL''')
391
392        self.query('''
393        DELETE subdata
394        FROM subdata
395        LEFT JOIN data ON sd_stat=dt_stat
396        WHERE dt_stat IS NULL''')
397
398        self.query('''
399        DELETE bins
400        FROM bins
401        LEFT JOIN data ON bn_id=dt_bin
402        WHERE dt_bin IS NULL''')
403
404        self.query('''
405        DELETE events
406        FROM events
407        LEFT JOIN runs ON ev_run=rn_id
408        WHERE rn_id IS NULL''')
409
410        self.query('''
411        DELETE event_names
412        FROM event_names
413        LEFT JOIN events ON en_id=ev_event
414        WHERE ev_event IS NULL''')
415