dbinit.py revision 1758:74acd5b23964
1
2# Copyright (c) 2003-2004 The Regents of The University of Michigan
3# All rights reserved.
4#
5# Redistribution and use in source and binary forms, with or without
6# modification, are permitted provided that the following conditions are
7# met: redistributions of source code must retain the above copyright
8# notice, this list of conditions and the following disclaimer;
9# redistributions in binary form must reproduce the above copyright
10# notice, this list of conditions and the following disclaimer in the
11# documentation and/or other materials provided with the distribution;
12# neither the name of the copyright holders nor the names of its
13# contributors may be used to endorse or promote products derived from
14# this software without specific prior written permission.
15#
16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#Permission is granted to use, copy, create derivative works and
29#redistribute this software and such derivative works for any purpose,
30#so long as the copyright notice above, this grant of permission, and
31#the disclaimer below appear in all copies made; and so long as the
32#name of The University of Michigan is not used in any advertising or
33#publicity pertaining to the use or distribution of this software
34#without specific, written prior authorization.
35#
36#THIS SOFTWARE IS PROVIDED AS IS, WITHOUT REPRESENTATION FROM THE
37#UNIVERSITY OF MICHIGAN AS TO ITS FITNESS FOR ANY PURPOSE, AND WITHOUT
38#WARRANTY BY THE UNIVERSITY OF MICHIGAN OF ANY KIND, EITHER EXPRESS OR
39#IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED WARRANTIES OF
40#MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE REGENTS OF
41#THE UNIVERSITY OF MICHIGAN SHALL NOT BE LIABLE FOR ANY DAMAGES,
42#INCLUDING DIRECT, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
43#DAMAGES, WITH RESPECT TO ANY CLAIM ARISING OUT OF OR IN CONNECTION
44#WITH THE USE OF THE SOFTWARE, EVEN IF IT HAS BEEN OR IS HEREAFTER
45#ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
46
47import MySQLdb
48
49class MyDB(object):
50    def __init__(self, options):
51        self.name = options.db
52        self.host = options.host
53        self.user = options.user
54        self.passwd = options.passwd
55        self.mydb = None
56        self.cursor = None
57
58    def admin(self):
59        self.close()
60        self.mydb = MySQLdb.connect(db='mysql', host=self.host, user=self.user,
61                                    passwd=self.passwd)
62        self.cursor = self.mydb.cursor()
63
64    def connect(self):
65        self.close()
66        self.mydb = MySQLdb.connect(db=self.name, host=self.host,
67                                    user=self.user, passwd=self.passwd)
68        self.cursor = self.mydb.cursor()
69
70    def close(self):
71        if self.mydb is not None:
72            self.mydb.close()
73        self.cursor = None
74
75    def query(self, sql):
76        self.cursor.execute(sql)
77
78    def drop(self):
79        self.query('DROP DATABASE IF EXISTS %s' % self.name)
80
81    def create(self):
82        self.query('CREATE DATABASE %s' % self.name)
83
84    def populate(self):
85        #
86        # Each run (or simulation) gets its own entry in the runs table to
87        # group stats by where they were generated
88        #
89        # COLUMNS:
90        #   'id' is a unique identifier for each run to be used in other
91        #       tables.
92        #   'name' is the user designated name for the data generated.  It is
93        #       configured in the simulator.
94        #   'user' identifies the user that generated the data for the given
95        #       run.
96        #   'project' another name to identify runs for a specific goal
97        #   'date' is a timestamp for when the data was generated.  It can be
98        #       used to easily expire data that was generated in the past.
99        #   'expire' is a timestamp for when the data should be removed from
100        #       the database so we don't have years worth of junk.
101        #
102        # INDEXES:
103        #   'run' is indexed so you can find out details of a run if the run
104        #       was retreived from the data table.
105        #   'name' is indexed so that two all run names are forced to be unique
106        #
107        self.query('''
108        CREATE TABLE runs(
109            rn_id	SMALLINT UNSIGNED	NOT NULL AUTO_INCREMENT,
110            rn_name	VARCHAR(200)		NOT NULL,
111            rn_sample	VARCHAR(32)		NOT NULL,
112            rn_user	VARCHAR(32)		NOT NULL,
113            rn_project	VARCHAR(100)            NOT NULL,
114            rn_date	TIMESTAMP		NOT NULL,
115            rn_expire	TIMESTAMP               NOT NULL,
116            PRIMARY KEY (rn_id),
117            UNIQUE (rn_name,rn_sample)
118        ) TYPE=InnoDB''')
119
120        #
121        # We keep the bin names separate so that the data table doesn't get
122        # huge since bin names are frequently repeated.
123        #
124        # COLUMNS:
125        #   'id' is the unique bin identifer.
126        #   'name' is the string name for the bin.
127        #
128        # INDEXES:
129        #   'bin' is indexed to get the name of a bin when data is retrieved
130        #       via the data table.
131        #   'name' is indexed to get the bin id for a named bin when you want
132        #       to search the data table based on a specific bin.
133        #
134        self.query('''
135        CREATE TABLE bins(
136            bn_id	SMALLINT UNSIGNED	NOT NULL AUTO_INCREMENT,
137            bn_name	VARCHAR(255)		NOT NULL,
138            PRIMARY KEY(bn_id),
139            UNIQUE (bn_name)
140        ) TYPE=InnoDB''')
141
142        #
143        # The stat table gives us all of the data for a particular stat.
144        #
145        # COLUMNS:
146        #   'stat' is a unique identifier for each stat to be used in other
147        #       tables for references.
148        #   'name' is simply the simulator derived name for a given
149        #       statistic.
150        #   'descr' is the description of the statistic and what it tells
151        #       you.
152        #   'type' defines what the stat tells you.  Types are:
153        #       SCALAR: A simple scalar statistic that holds one value
154        #       VECTOR: An array of statistic values.  Such a something that
155        #           is generated per-thread.  Vectors exist to give averages,
156        #	     pdfs, cdfs, means, standard deviations, etc across the
157        #           stat values.
158        #       DIST: Is a distribution of data.  When the statistic value is
159        #	     sampled, its value is counted in a particular bucket.
160        #           Useful for keeping track of utilization of a resource.
161        #           (e.g. fraction of time it is 25% used vs. 50% vs. 100%)
162        #       VECTORDIST: Can be used when the distribution needs to be
163        #	     factored out into a per-thread distribution of data for
164        #	     example.  It can still be summed across threads to find
165        #           the total distribution.
166        #       VECTOR2D: Can be used when you have a stat that is not only
167        #           per-thread, but it is per-something else.  Like
168        #           per-message type.
169        #       FORMULA: This statistic is a formula, and its data must be
170        #	     looked up in the formula table, for indicating how to
171        #           present its values.
172        #   'subdata' is potentially used by any of the vector types to
173        #       give a specific name to all of the data elements within a
174        #       stat.
175        #   'print' indicates whether this stat should be printed ever.
176        #       (Unnamed stats don't usually get printed)
177        #   'prereq' only print the stat if the prereq is not zero.
178        #   'prec' number of decimal places to print
179        #   'nozero' don't print zero values
180        #   'nonan' don't print NaN values
181        #   'total' for vector type stats, print the total.
182        #   'pdf' for vector type stats, print the pdf.
183        #   'cdf' for vector type stats, print the cdf.
184        #
185        #   The Following are for dist type stats:
186        #   'min' is the minimum bucket value. Anything less is an underflow.
187        #   'max' is the maximum bucket value. Anything more is an overflow.
188        #   'bktsize' is the approximate number of entries in each bucket.
189        #   'size' is the number of buckets. equal to (min/max)/bktsize.
190        #
191        # INDEXES:
192        #   'stat' is indexed so that you can find out details about a stat
193        #       if the stat id was retrieved from the data table.
194        #   'name' is indexed so that you can simply look up data about a
195        #       named stat.
196        #
197        self.query('''
198        CREATE TABLE stats(
199            st_id	SMALLINT UNSIGNED	NOT NULL AUTO_INCREMENT,
200            st_name	VARCHAR(255)		NOT NULL,
201            st_descr	TEXT			NOT NULL,
202            st_type	ENUM("SCALAR", "VECTOR", "DIST", "VECTORDIST",
203                "VECTOR2D", "FORMULA")	NOT NULL,
204            st_print	BOOL			NOT NULL,
205            st_prereq	SMALLINT UNSIGNED	NOT NULL,
206            st_prec	TINYINT			NOT NULL,
207            st_nozero	BOOL			NOT NULL,
208            st_nonan	BOOL			NOT NULL,
209            st_total	BOOL			NOT NULL,
210            st_pdf	BOOL			NOT NULL,
211            st_cdf	BOOL			NOT NULL,
212            st_min	DOUBLE			NOT NULL,
213            st_max	DOUBLE			NOT NULL,
214            st_bktsize	DOUBLE			NOT NULL,
215            st_size	SMALLINT UNSIGNED	NOT NULL,
216            PRIMARY KEY (st_id),
217            UNIQUE (st_name)
218        ) TYPE=InnoDB''')
219
220        #
221        # This is the main table of data for stats.
222        #
223        # COLUMNS:
224        #   'stat' refers to the stat field given in the stat table.
225        #
226        #   'x' referrs to the first dimension of a multi-dimensional stat. For
227        #       a vector, x will start at 0 and increase for each vector
228        #       element.
229        #       For a distribution:
230        #       -1: sum (for calculating standard deviation)
231        #       -2: sum of squares (for calculating standard deviation)
232        #       -3: total number of samples taken (for calculating
233        #           standard deviation)
234        #       -4: minimum value
235        #       -5: maximum value
236        #       -6: underflow
237        #       -7: overflow
238        #   'y' is used by a VECTORDIST and the VECTOR2D to describe the second
239        #       dimension.
240        #   'run' is the run that the data was generated from.  Details up in
241        #       the run table
242        #   'tick' is a timestamp generated by the simulator.
243        #   'bin' is the name of the bin that the data was generated in, if
244        #       any.
245        #   'data' is the actual stat value.
246        #
247        # INDEXES:
248        #   'stat' is indexed so that a user can find all of the data for a
249        #       particular stat. It is not unique, because that specific stat
250        #       can be found in many runs, bins, and samples, in addition to
251        #       having entries for the mulidimensional cases.
252        #   'run' is indexed to allow a user to remove all of the data for a
253        #       particular execution run.  It can also be used to allow the
254        #       user to print out all of the data for a given run.
255        #
256        self.query('''
257        CREATE TABLE data(
258            dt_stat	SMALLINT UNSIGNED	NOT NULL,
259            dt_x	SMALLINT		NOT NULL,
260            dt_y	SMALLINT		NOT NULL,
261            dt_run	SMALLINT UNSIGNED	NOT NULL,
262            dt_tick	BIGINT UNSIGNED		NOT NULL,
263            dt_bin	SMALLINT UNSIGNED	NOT NULL,
264            dt_data	DOUBLE			NOT NULL,
265            INDEX (dt_stat),
266            INDEX (dt_run),
267            UNIQUE (dt_stat,dt_x,dt_y,dt_run,dt_tick,dt_bin)
268        ) TYPE=InnoDB;''')
269
270        #
271        # Names and descriptions for multi-dimensional stats (vectors, etc.)
272        # are stored here instead of having their own entry in the statistics
273        # table. This allows all parts of a single stat to easily share a
274        # single id.
275        #
276        # COLUMNS:
277        #   'stat' is the unique stat identifier from the stat table.
278        #   'x' is the first dimension for multi-dimensional stats
279        #       corresponding to the data table above.
280        #   'y' is the second dimension for multi-dimensional stats
281        #       corresponding to the data table above.
282        #   'name' is the specific subname for the unique stat,x,y combination.
283        #   'descr' is the specific description for the uniqe stat,x,y
284        #        combination.
285        #
286        # INDEXES:
287        #   'stat' is indexed so you can get the subdata for a specific stat.
288        #
289        self.query('''
290        CREATE TABLE subdata(
291            sd_stat	SMALLINT UNSIGNED	NOT NULL,
292            sd_x	SMALLINT		NOT NULL,
293            sd_y	SMALLINT		NOT NULL,
294            sd_name	VARCHAR(255)		NOT NULL,
295            sd_descr	TEXT,
296            UNIQUE (sd_stat,sd_x,sd_y)
297        ) TYPE=InnoDB''')
298
299
300        #
301        # The formula table is maintained separately from the data table
302        # because formula data, unlike other stat data cannot be represented
303        # there.
304        #
305        # COLUMNS:
306        #   'stat' refers to the stat field generated in the stat table.
307        #   'formula' is the actual string representation of the formula
308        #       itself.
309        #
310        # INDEXES:
311        #   'stat' is indexed so that you can just look up a formula.
312        #
313        self.query('''
314        CREATE TABLE formulas(
315            fm_stat	SMALLINT UNSIGNED	NOT NULL,
316            fm_formula	BLOB			NOT NULL,
317            PRIMARY KEY(fm_stat)
318        ) TYPE=InnoDB''')
319
320        #
321        # Each stat used in each formula is kept in this table.  This way, if
322        # you want to print out a particular formula, you can simply find out
323        # which stats you need by looking in this table.  Additionally, when
324        # you remove a stat from the stats table and data table, you remove
325        # any references to the formula in this table.  When a formula is no
326        # longer referred to, you remove its entry.
327        #
328        # COLUMNS:
329        #   'stat' is the stat id from the stat table above.
330        #   'child' is the stat id of a stat that is used for this formula.
331        #       There may be many children for any given 'stat' (formula)
332        #
333        # INDEXES:
334        #   'stat' is indexed so you can look up all of the children for a
335        #       particular stat.
336        #   'child' is indexed so that you can remove an entry when a stat is
337        #       removed.
338        #
339        self.query('''
340        CREATE TABLE formula_ref(
341            fr_stat	SMALLINT UNSIGNED	NOT NULL,
342            fr_run	SMALLINT UNSIGNED	NOT NULL,
343            UNIQUE (fr_stat,fr_run),
344            INDEX (fr_stat),
345            INDEX (fr_run)
346        ) TYPE=InnoDB''')
347
348        # COLUMNS:
349        #   'event' is the unique event id from the event_desc table
350        #   'run' is simulation run id that this event took place in
351        #   'tick' is the tick when the event happened
352        #
353        # INDEXES:
354        #   'event' is indexed so you can look up all occurences of a
355        #       specific event
356        #   'run' is indexed so you can find all events in a run
357        #   'tick' is indexed because we want the unique thing anyway
358        #   'event,run,tick' is unique combination
359        self.query('''
360        CREATE TABLE events(
361            ev_event	SMALLINT UNSIGNED	NOT NULL,
362            ev_run	SMALLINT UNSIGNED	NOT NULL,
363            ev_tick	BIGINT   UNSIGNED       NOT NULL,
364            INDEX(ev_event),
365            INDEX(ev_run),
366            INDEX(ev_tick),
367            UNIQUE(ev_event,ev_run,ev_tick)
368        ) TYPE=InnoDB''')
369
370        # COLUMNS:
371        #   'id' is the unique description id
372        #   'name' is the name of the event that occurred
373        #
374        # INDEXES:
375        #   'id' is indexed because it is the primary key and is what you use
376        #       to look up the descriptions
377        #   'name' is indexed so one can find the event based on name
378        #
379        self.query('''
380        CREATE TABLE event_names(
381            en_id	SMALLINT UNSIGNED	NOT NULL AUTO_INCREMENT,
382            en_name	VARCHAR(255)		NOT NULL,
383            PRIMARY KEY (en_id),
384            UNIQUE (en_name)
385        ) TYPE=InnoDB''')
386
387    def clean(self):
388        self.query('''
389        DELETE data
390        FROM data
391        LEFT JOIN runs ON dt_run=rn_id
392        WHERE rn_id IS NULL''')
393
394        self.query('''
395        DELETE formula_ref
396        FROM formula_ref
397        LEFT JOIN runs ON fr_run=rn_id
398        WHERE rn_id IS NULL''')
399
400        self.query('''
401        DELETE formulas
402        FROM formulas
403        LEFT JOIN formula_ref ON fm_stat=fr_stat
404        WHERE fr_stat IS NULL''')
405
406        self.query('''
407        DELETE stats
408        FROM stats
409        LEFT JOIN data ON st_id=dt_stat
410        WHERE dt_stat IS NULL''')
411
412        self.query('''
413        DELETE subdata
414        FROM subdata
415        LEFT JOIN data ON sd_stat=dt_stat
416        WHERE dt_stat IS NULL''')
417
418        self.query('''
419        DELETE bins
420        FROM bins
421        LEFT JOIN data ON bn_id=dt_bin
422        WHERE dt_bin IS NULL''')
423
424        self.query('''
425        DELETE events
426        FROM events
427        LEFT JOIN runs ON ev_run=rn_id
428        WHERE rn_id IS NULL''')
429
430        self.query('''
431        DELETE event_names
432        FROM event_names
433        LEFT JOIN events ON en_id=ev_event
434        WHERE ev_event IS NULL''')
435