% Data Mining Bibliographies Copyright Information
%
% The author reserves the
%
% Copyright (C) 1997 Andy Pryke. All rights reserved.
%
% for the compilation of this KDD bibliography collection.
%
% If you find the bibliography collection useful for your work, I would
% be happy if you acknowledge it and me. You could also send me a
% postcard if you wish (address below).
%
% I usually give my consent that the collection may be copied and
% distributed with the following conditions:
%
% 1) It may be used only for research or educational purposes
%
% and
%
% 2) Any copy must be accompanied by a reference to the original
% collection and its author.
%
% and
%
% 3) This information must always accompany every copy of a bibliography.
%
% I reserve the right to revoke the above permission at any time.
%
% Any other use must be negotiated in advance.
%
% Any commercial use of the bibliographies is strictly prohibited. In
% particular, the whole or derived bibliographies may not be sold for
% profit or included in commercial documents (e.g., published on CD-ROM,
% floppy disks, books, magazines, or other print form) without the prior
% written permission of the copyright holder.
%
% Please contact the author if the intended usage is not covered by the
% above statement.
%
% Abstracts of publications published by the ACM and the IEEE are also
% subject to the respective "interim" or "provisional" copyright
% policies:
%
% ACM copyright policy (http://www.acm.org/pubs/copyright_policy/)
% IEEE copyright policy (http://www.ieee.org/copyright/policies.htm)
%
% This copyright notice is derived from one by Alf-Christian Achilles
% for his (massive) Computer Science Bibliography Collection at
% (http://liinwww.ira.uka.de/bibliography/index.html).
%
% --------------------------------------------------------------------
%
% My address:
%
% My postal address is:
%
% Andy Pryke,
% Department of Computer Science,
% The University of Birmingham,
% Edgbaston,
% Birmingham.
% B15 2TT
%
% Fax : 0121 414 4281
% Phone: 0121 414 3736
% Email: A.N.Pryke(at)cs.bham.ac.uk
% Web: http://www.cs.bham.ac.uk/~anp/
%
,
@article{machine_learning_journal_special:93,
  key     = {Machine_Learning_Journal_Special:93},
  journal = {Machine Learning Journal},
  volume  = {5},
  number  = {6},
  month   = dec,
  year    = {1993},
  note    = {Special issue on Learning and Discovery in Databases},
}
Improved Methods for Finding Association Rules,
Available as
compressed postscript.
@techreport{no_author:improved-methods:,
  title    = {Improved Methods for Finding Association Rules},
  url      = {ftp://ftp.cs.helsinki.fi/pub/Reports/by_Project/PMDM/Improved_Methods_for_Finding_Association_Rules.ps.gz},
  abstract = {Association rules are statements of the form for 90\%
              of the rows of the relation, if the row has value 1 in
              the columns in set W, then it has 1 also in column B.
              Agrawal, Imielinski, and Swami introduced the problem
              of mining association rules from large collections of
              data, and gave a method based on successive passes over
              the database. We give an improved algorithm for the
              problem. The method is based on careful combinatorial
              analysis of the information obtained in previous
              passes; this makes it possible to eliminate unnecessary
              candidate rules. Experiments on a university course
              enrollment database indicate that the method
              outperforms the previous one by a factor of 5. We also
              give simple information-theoretic lower bounds for the
              problem of finding association rules, and show that
              sampling is in general a very efficient way of finding
              such rules. Computing Reviews Categories and Subject
              Descriptors: H.3.3 [Information Systems]: Information
              Storage and Retrieval - Information Search and
              Retrieval I.2.6 [Computing Methodologies]: Artificial
              Intelligence - Learning I.2.8 [Computing
              Methodologies]: Artificial Intelligence - Problem
              Solving, Control Methods, and Search General Terms:
              Databases, machine learning, artificial intelligence.
              Additional Key Words and Phrases: Database mining,
              knowledge discovery in databases, association rules,
              covering sets.},
}
Learning Decision Trees for Mapping the Local Environment in Mobile Robot Navigation,
Available as
compressed postscript.
@techreport{no_author:learning-decision:,
  title    = {Learning Decision Trees for Mapping the Local
              Environment in Mobile Robot Navigation},
  url      = {ftp://ftp.cs.helsinki.fi/pub/Reports/by_Project/PMDM/Learning_Decision_Trees_for_Mapping_the_Local_Environment_in_Mobile_Robot_Navigation.ps.gz},
  abstract = {This paper describes the use of the C4.5 decision tree
              learning algorithm in the design of a classifier for a
              new approach to the mapping of a mobile robot's local
              environment. The decision tree uses the features from
              the echoes of an ultrasonic array mounted on the robot
              to classify the contours of its local environment. The
              contours are classified into a finite number of two
              dimensional shapes to form a primitive map which is to
              be used for navigation. The nature of the problem,
              noise and the practical timing constraints,
              distinguishes it from those typically used in machine
              learning applications and highlights some of the
              advantages of decision tree learning in robotic
              applications.},
}
Overheads for the AI'94 Tutorial on Intelligent Learning Database Systems,
Available as
postscript.
@misc{no_author:overheads-ai94:,
  title    = {Overheads for the {AI}'94 Tutorial on Intelligent
              Learning Database Systems},
  url      = {ftp://coral.cs.jcu.edu.au/pub/research/HCV/KDD.ps},
  abstract = {This full-day tutorial presents and discusses
              techniques for the following 3 interconnected phases in
              constructing intelligent learning database systems: (1)
              Translation of standard database information into a
              form suitable for use by a rule-based system; (2) Using
              machine learning techniques to produce rule bases from
              databases; and (3) Interpreting the rules produced to
              solve users' problems and/or reduce data spaces. It
              suits a wide audience including postgraduate students
              and industrial people from databases, expert systems,
              and machine learning.},
  annote   = {Comments and suggestions for improvements are
              solicited! Comments to Xindong Wu
              (xindong(at)INSECT.SD.MONASH.EDU.AU)},
}
State Of The Art,
Available as
sec8.htm.
@article{no_author:state-art:95,
  title   = {State Of The Art},
  journal = {Byte},
  month   = oct,
  year    = {1995},
  url     = {http://www.byte.com/art/9510/sec8/sec8.htm},
  annote  = {A number of articles, good introduction to data
             mining},
}
Discovery of Actionable Patterns in Databases: The Action Hierarchy Approach, Gediminas Adomavicius and Alexander Tuzhilin
@inproceedings{adomavicius.ea:actionable-patterns:97,
  author   = {Adomavicius, Gediminas and Tuzhilin, Alexander},
  title    = {Discovery of Actionable Patterns in Databases: The
              Action Hierarchy Approach},
  pages    = {111},
  crossref = {heckerman.ea:proceedings-third:97},
}
Mining Association Rules between Sets of Items in Large Databases, Rakesh Agrawal and Tomasz Imielinski and Arun N. Swami
Available as
postscript.
@inproceedings{agrawal.ea:association-rules:93a,
  author       = {Agrawal, Rakesh and Imielinski, Tomasz and
                  Swami, Arun N.},
  title        = {Mining Association Rules between Sets of Items in
                  Large Databases},
  editor       = {Buneman, Peter and Jajodia, Sushil},
  booktitle    = {Proceedings of the 1993 {ACM} {SIGMOD} International
                  Conference on Management of Data},
  address      = {Washington, D.C.},
  month        = "26--28~" # may,
  year         = {1993},
  pages        = {207--216},
  url          = {http://www.almaden.ibm.com/cs/people/ragrawal/papers/sigmod93.ps},
  abstract     = {We are given a large database of customer
                  transactions. Each transaction consists of items
                  purchased by a customer in a visit. We present an
                  efficient algorithm that generates all significant
                  association rules between items in the database. The
                  algorithm incorporates buffer management and novel
                  estimation and pruning techniques. We also present
                  results of applying this algorithm to sales data
                  obtained from a large retailing company, which shows
                  the effectiveness of the algorithm.},
  key_modifier = {a},
}
Mining association rules between sets of items in large databases, Rakesh Agrawal and Tomasz Imielinski and Arun Swami
@article{agrawal.ea:association-rules:93b,
  author             = {Agrawal, Rakesh and Imielinski, Tomasz and Swami, Arun},
  title              = {Mining association rules between sets of items in
                        large databases},
  journal            = {SIGMOD Record (ACM Special Interest Group on
                        Management of Data)},
  volume             = {22},
  number             = {2},
  pages              = {207--216},
  month              = jun,
  year               = {1993},
  isbn               = {0-89791-592-5},
  issn               = {0163-5808},
  abstract           = {We are given a large database of customer
                        transactions. Each transaction consists of items
                        purchased by a customer in a visit. We present an
                        efficient algorithm that generates all significant
                        association rules between items in the database. The
                        algorithm incorporates buffer management and novel
                        estimation and pruning techniques. We also present
                        results of applying this algorithm to sales data
                        obtained from a large retailing company, which shows
                        the effectiveness of the algorithm.},
  affiliation        = {IBM Almaden Research Center},
  affiliationaddress = {San Jose, CA, USA},
  classification     = {723.3; 921.6; 911.4; 723.2; 722.1; 922.1; C6160Z
                        (Other DBMS); C6130 (Data handling techniques); C6170
                        (Expert systems); C6120 (File organisation); C7170
                        (Marketing)},
  conference         = {Proceedings of the 1993 ACM SIGMOD International
                        Conference on Management of Data},
  conferenceyear     = {1993},
  keywords           = {Database systems; Algorithms; Marketing; Data
                        handling; Data storage equipment; Probability;
                        Estimation; Query languages; Large scale systems;
                        Associative processing; Administrative data processing;
                        Large databases; Mining association rules; Pruning
                        technique; Basket data, Large database; Customer
                        transactions; Efficient algorithm; Association rules;
                        Buffer management; Novel estimation; Pruning
                        techniques; Sales data; Large retailing company},
  meetingaddress     = {Washington, DC, USA},
  meetingdate        = {May 26--28 1993},
  meetingdate2       = {05/26--28/93},
  publisherinfo      = {Fort Collins Computer Center},
  sponsor            = {ACM, SIGMOD; Minerals, Metals \& Materials Society},
  thesaurus          = {Knowledge based systems; Marketing data processing;
                        Storage management; Transaction processing; Very large
                        databases},
  xxcrossref         = {Anonymous:1993:SAS},
  key_modifier       = {b},
}
Database mining - a performance perspective, R. Agrawal and T. Imielinski and A. Swami
@article{agrawal.ea:database-performance:93,
  author   = {Agrawal, R. and Imielinski, T. and Swami, A.},
  title    = {Database Mining: A Performance Perspective},
  journal  = {IEEE Transactions on Knowledge and Data Engineering},
  volume   = {5},
  number   = {6},
  pages    = {914--925},
  year     = {1993},
  address  = {Ibm Corp, Almaden Res Ctr, 650 Harry Rd, San Jose, Ca,
              95120},
  abstract = {We present our perspective of database mining as the
              confluence of machine learning techniques and the
              performance emphasis of database technology. We
              describe three classes of database mining problems
              involving classification, associations, and sequences,
              and argue that these problems can be uniformly viewed
              as requiring discovery of rules embedded in massive
              data. We describe a model and some basic operations for
              the process of rule discovery. We show how the database
              mining problems we consider map to this model and how
              they can be solved by using the basic operations we
              propose. We give an example of an algorithm for
              classification obtained by combining the basic rule
              discovery operations. This algorithm not only is
              efficient in discovering classification rules but also
              has accuracy comparable to ID3, one of the current best
              classifiers.},
  annote   = {Identification and unification of 3 classes of data
              mining problem, Classification, Association and
              Sequences. They then go on to propose a unifying
              framework for these three problems, and five basic
              operators for rule discovery. These are then used to
              construct an algorithm CDP (Classifier with Dynamic
              Pruning) which out performs ID3 in classifier accuracy
              and efficiency on a test problem.},
  keywords = {ASSOCIATIONS, CLASSIFICATION, DATABASE MINING,
              DECISION TREES, KNOWLEDGE DISCOVERY, SEQUENCES},
}
Developing Tightly-Coupled Data Mining Applications on a Relational Database System, Rakesh Agrawal and Kyuseok Shim
@inproceedings{agrawal.ea:developing-tightly-coupled:96,
  author   = {Agrawal, Rakesh and Shim, Kyuseok},
  title    = {Developing Tightly-Coupled Data Mining Applications on
              a Relational Database System},
  pages    = {287},
  crossref = {simoudis.ea:proceedings-second:96},
}
Fast Algorithms for Mining Association Rules in Large Databases, R. Agrawal and R. Srikant
@inproceedings{agrawal.ea:fast-algorithms:94,
  author    = {Agrawal, R. and Srikant, R.},
  title     = {Fast Algorithms for Mining Association Rules in Large
               Databases},
  editor    = {Bocca, Jorge B. and Jarke, Matthias and Zaniolo, Carlo},
  booktitle = {20th International Conference on Very Large Data
               Bases, September 12--15, 1994, Santiago, Chile
               proceedings},
  publisher = {Morgan Kaufmann Publishers},
  address   = {Los Altos, CA 94022, USA},
  pages     = {487--499},
  month     = sep,
  year      = {1994},
  annote    = {Also known as VLDB'94},
  keywords  = {very large data bases; VLDB},
}
Parallel mining of association rules, R. Agrawal and J. C. Shafer
@article{agrawal.ea:parallel-association:96,
  author   = {Agrawal, R. and Shafer, J. C.},
  title    = {Parallel Mining of Association Rules},
  journal  = {IEEE Transactions on Knowledge and Data Engineering},
  volume   = {8},
  number   = {6},
  pages    = {962--969},
  year     = {1996},
  address  = {Ibm Corp, Almaden Res Ctr, 650 Harry Rd, San Jose, Ca,
              95120},
  abstract = {We consider the problem of mining association rules on
              a shared-nothing multiprocessor. We present three
              algorithms that explore a spectrum of trade-offs
              between computation, communication, memory usage,
              synchronization, and the use of problem-specific
              information. The best algorithm exhibits near perfect
              scaleup behavior, yet requires only minimal overhead
              compared to the current best serial algorithm.},
  keywords = {data mining, association rules, parallel algorithms},
}
Quest: A Project on Database Mining, R. Agrawal and M. Carey and C. Faloutson and S. Ghosh and A. Houtsma and T. Imielinski and B. Iyer and A. Mahboob and H. Miranda and R. Srikant and A. Swami
@article{agrawal.ea:quest-project:94a,
  author         = {Agrawal, R. and Carey, M. and Faloutsos, C. and
                    Ghosh, S. and Houtsma, A. and Imielinski, T. and
                    Iyer, B. and Mahboob, A. and Miranda, H. and
                    Srikant, R. and Swami, A.},
  title          = {{Quest}: {A} Project on Database Mining},
  journal        = {SIGMOD Record (ACM Special Interest Group on
                    Management of Data)},
  volume         = {23},
  number         = {2},
  pages          = {514},
  month          = jun,
  year           = {1994},
  issn           = {0163-5808},
  affiliation    = {IBM Almaden Res. Center, San Jose, CA, USA},
  classification = {C6160 (Database management systems (DBMS))},
  keywords       = {Quest project; Database mining; Tertiary storage; Data
                    model construction; Data model verification},
  thesaurus      = {Very large databases},
  xxcrossref     = {Anonymous:1994:ASI},
  key_modifier   = {a},
}
Quest: A Project on Database Mining, Rakesh Agrawal and Michael J. Carey and Christos Faloutsos and Sakti P. Ghosh and Maurice A. W. Houtsma and Tomasz Imielinski and Balakrishna R. Iyer and A. Mahboob and H. Miranda and Ramakrishnan Srikant and Arun N. Swami
@inproceedings{agrawal.ea:quest-project:94b,
  author       = {Agrawal, Rakesh and Carey, Michael J. and
                  Faloutsos, Christos and Ghosh, Sakti P. and
                  Houtsma, Maurice A. W. and Imielinski, Tomasz and
                  Iyer, Balakrishna R. and Mahboob, A. and
                  Miranda, H. and Srikant, Ramakrishnan and
                  Swami, Arun N.},
  title        = {Quest: {A} Project on Database Mining},
  editor       = {Snodgrass, Richard T. and Winslett, Marianne},
  booktitle    = {Proceedings of the 1994 {ACM} {SIGMOD} International
                  Conference on Management of Data},
  address      = {Minneapolis, Minnesota},
  month        = "24--27~" # may,
  year         = {1994},
  pages        = {514},
  key_modifier = {b},
}
The Quest Data Mining System, Rakesh Agrawal and Manish Mehta and John Shafer and Ramakrishnan Srikant and Andreas Arning and Toni Bollinger
@inproceedings{agrawal.ea:quest-system:96,
  author   = {Agrawal, Rakesh and Mehta, Manish and Shafer, John and
              Srikant, Ramakrishnan and Arning, Andreas and
              Bollinger, Toni},
  title    = {The Quest Data Mining System},
  pages    = {244},
  crossref = {simoudis.ea:proceedings-second:96},
}
Mining Sequential Patterns, R. Agrawal and R. Srikant
@inproceedings{agrawal.ea:sequential-patterns:95,
  author       = {Agrawal, R. and Srikant, R.},
  title        = {Mining Sequential Patterns},
  booktitle    = {International Conference on Data Engineering},
  organization = {IEEE},
  year         = {1995},
  pages        = {3--14},
  abstract     = {We are given a large database of customer
                  transactions, where each transaction consists of
                  customer-id, transaction time, and the items bought in
                  the transaction. We introduce the problem of mining
                  sequential patterns over such databases. We present
                  three algorithms to solve this problem, and empirically
                  evaluate their performance using synthetic data. Two of
                  the proposed algorithms, AprioriSome and AprioriAll,
                  have comparable performance, albeit AprioriSome
                  performs a little better when the minimum number of
                  customers that must support a sequential pattern is
                  low. Scale-up experiments show that both AprioriSome
                  and AprioriAll scale linearly with the number of
                  customer transactions. They also have excellent
                  scale-up properties with respect to the number of
                  transactions per customer and the number of items in a
                  transaction.},
}
Data Mining, Rakesh Agrawal
@inproceedings{agrawal:data-mining:94,
  author    = {Agrawal, Rakesh},
  title     = {Data Mining},
  booktitle = {Proceedings of the 13th Symposium on Principles of
               Database Systems},
  publisher = {ACM Press},
  address   = {New York, NY, USA},
  pages     = {75--76},
  month     = may,
  year      = {1994},
}
Tutorial: Data Mining, R. Agrawal
@inproceedings{agrawal:tutorial:94,
  author    = {Agrawal, R.},
  title     = {Tutorial: Data Mining},
  editor    = {{ACM}},
  booktitle = {13th Symposium --- 1994 May: Minneapolis; {MN}},
  series    = {PROCEEDINGS OF THE ACM SIGACT SIGMOD SIGART SYMPOSIUM
               ON PRINCIPLES OF DATABASE SYSTEMS 1994},
  volume    = {13},
  publisher = {ACM Press},
  address   = {New York, NY 10036, USA},
  pages     = {75--76},
  year      = {1994},
  keywords  = {database systems; ACM; SIGACT; SIGMOD; SIGART;
               computability; theory},
}
Machine Learning tutorial (Slides and Anotated Bibliography), David Aha
Available as
hypertext.
@misc{aha:machine-learning:,
  author = {Aha, David},
  title  = {Machine Learning tutorial (Slides and Anotated
            Bibliography)},
  url    = {http://www.aic.nrl.navy.mil/~aha/slides.html},
  annote = {David Aha presented the Machine Learning tutorial at
            AI \& Stats 1995. He's kindly put his slides online},
}
Temporal aspects in data mining, Salem Al-naemi
@techreport{al-naemi:temporal-aspects:92,
  author      = {Al-naemi, Salem},
  title       = {Temporal aspects in data mining},
  institution = {Computer Science Department, University of
                 Birmingham},
  year        = {1992},
  note        = {Dated 1992/3},
  annote      = {Sections on RdB's, other temporal models and time
                 series},
}
Mine for Gold with Parallel Systems, Michael Alexander
@article{alexander:mine-gold:94,
  author   = {Alexander, Michael},
  title    = {Mine for Gold with Parallel Systems},
  journal  = {Datamation},
  volume   = {40},
  number   = {22},
  pages    = {65--??},
  day      = {15},
  month    = nov,
  year     = {1994},
  issn     = {0011-6963},
  abstract = {Parallel computing technology has become more
              accessible to IS shops with the release of parallelized
              versions of popular RDBMSs. With such off-the-shelf
              tools, your company can gain competitive advantage
              through techniques like data mining that allow you to
              more finely analyze and project demand for your
              products. But if you're going to need the power of
              massively parallel systems, off-the-shelf solutions are
              still a few years away.},
}
Partial Classification Using Association Rules, Kamal Ali and Stefanos Manganaris and Ramakrishnan Srikant
@inproceedings{ali.ea:partial-classification:97,
  author   = {Ali, Kamal and Manganaris, Stefanos and
              Srikant, Ramakrishnan},
  title    = {Partial Classification Using Association Rules},
  pages    = {115},
  crossref = {heckerman.ea:proceedings-third:97},
}
Charter, Robert B. Allen
@article{allen:charter:95,
  author    = {Allen, Robert B.},
  title     = {Charter},
  journal   = {ACM Transactions on Information Systems},
  volume    = {13},
  number    = {3},
  pages     = {235},
  year      = {1995},
  copyright = {(c) Copyright 1995 Association for Computing
               Machinery},
  abstract  = {The ACM Transactions on Information Systems (TOIS)
               considers the design, performance, and evaluation of
               computer systems that facilitate the presentation of
               information in a variety of media, as well as
               underlying technologies that support these systems. The
               major themes of TOIS and those topics which distinguish
               it from other ACM Transactions include: - Information
               Retrieval and Information Filtering: Algorithms and
               inference mechanisms for search, retrieval, and
               presentation of information and models of user
               information preferences. - Information Interfaces:
               Hypertext and hypermedia interfaces, information
               visualization, multimedia presentation, and task and
               user models for information systems. - Natural Language
               Processing: Computational linguistics and models of
               natural language (including content, syntax, semantics,
               and dialogue) relevant to information systems. -
               Knowledge and Information Representation:
               Representation issues for supporting information
               systems including semantic and object-oriented
               databases, knowledge bases, and hypertext/hypermedia
               document models. - Multimedia Information Systems:
               Semantics, search, and presentation of media including
               audio, image, video, and virtual reality. - Networked
               Information Systems: Interfaces and indexing, resource
               discovery, and visualization. - Organizational
               Interfaces and Social Impact of Information Systems:
               Electronic mail; decision and negotiation support
               systems; the effects of information system use on
               groups, organizations, and communities; social
               constraints imposed on information systems such as
               legal and privacy concerns. - Design and Evaluation of
               Information Systems: Design principles for information
               systems, methodologies for evaluating information
               systems, and programming languages relevant to
               information systems. - Information System Applications:
               Electronic books, documents, journals, movies, and
               libraries; authoring systems; office information
               systems; geographic information systems; and
               intelligent tutoring systems.},
}
Knowledge discovery in biomedical databases - a machine induction approach, H. Alnahi and S. Alshawi
@article{alnahi.ea:biomedical-machine:93,
  author   = {Alnahi, H. and Alshawi, S.},
  title    = {Knowledge Discovery in Biomedical Databases: A
              Machine Induction Approach},
  journal  = {Computer Methods and Programs in Biomedicine},
  volume   = {39},
  number   = {3--4},
  pages    = {343--349},
  year     = {1993},
  address  = {Brunel Univ, Dept Comp Sci, Uxbridge Ub8 3Ph, Middx,
              England},
  abstract = {The increase in the number and size of available
              databases by far exceeds the growth of the
              corresponding knowledge. Furthermore, many databases
              contain information which is not possessed by an
              existing human expert. This creates both a need and an
              opportunity for extracting knowledge from databases. An
              unsolved problem in molecular biology is the problem of
              predicting a protein's secondary structure from its
              primary structure. Inductive machine learning is a
              search for a plausible general description which can
              explain the given input data, and is useful for
              predicting new data. In this paper we present a
              statistical inductive algorithm which can be used to
              produce new rules for predicting multiple protein
              secondary structures from protein primary structure
              databases.},
  keywords = {SECONDARY STRUCTURE, PREDICTION, SEQUENCE, MACHINE
              LEARNING, INDUCTION, DATABASES, KNOWLEDGE, RULES,
              PROTEIN PRIMARY SECONDARY STRUCTURES, AMINO ACID
              RESIDUES},
}
Discovering rules for water demand prediction: an enhanced rough-set approach (reprinted from proceedings of the international joint conference on artificial intelligence), A. J. An and N. Shan and C. Chan and N. Cercone and W. Ziarko
@article{an.ea:discovering-rules:96,
  author   = {An, A. J. and Shan, N. and Chan, C. and Cercone, N. and
              Ziarko, W.},
  title    = {Discovering Rules for Water Demand Prediction: An
              Enhanced Rough-Set Approach},
  journal  = {Engineering Applications of Artificial Intelligence},
  volume   = {9},
  number   = {6},
  pages    = {645--653},
  year     = {1996},
  note     = {Reprinted from Proceedings of the International Joint
              Conference on Artificial Intelligence},
  address  = {Univ Regina, Dept Comp Sci, Regina, Sk S4S 0A2,
              Canada},
  abstract = {Prediction of consumer demands is a pre-requisite for
              optimal control of water distribution systems because
              minimum-cost pumping schedules can be computed if water
              demands are accurately estimated. This paper presents an
              enhanced rough-sets method for generating prediction
              rules from a set of observed data. The proposed method
              extends upon the standard rough set model by making use
              of the statistical information inherent in the data to
              handle incomplete and ambiguous training samples. It
              also discusses some experimental results from using
              this method for discovering knowledge on water demand
              prediction. Copyright (C) 1996 IJCAI Inc.},
  keywords = {water demand prediction, knowledge discovery, rough
              sets},
}
Edm - a general framework for data mining based on evidence theory, S. S. Anand and D. A. Bell and J. G. Hughes
@article{anand.ea:edm-general:96,
  author   = {Anand, S. S. and Bell, D. A. and Hughes, J. G.},
  title    = {{EDM}: A General Framework for Data Mining Based on
              Evidence Theory},
  journal  = {Data \& Knowledge Engineering},
  volume   = {18},
  number   = {3},
  pages    = {189--223},
  year     = {1996},
  address  = {Univ Ulster, Fac Informat, Sch Informat \& Software
              Engn, Jordanstown, North Ireland},
  abstract = {Data Mining or Knowledge Discovery in Databases
              [1,15,23] is currently one of the most exciting and
              challenging areas where database techniques are coupled
              with techniques from Artificial Intelligence and
              mathematical sub-disciplines to great potential
              advantage. It has been defined as the non-trivial
              extraction of implicit, previously unknown and
              potentially useful information from data. A lot of
              research effort is being directed towards building
              tools for discovering interesting patterns which are
              hidden below the surface in databases. However, most of
              the work being done in this field has been
              problem-specific and no general framework has yet been
              proposed for Data Mining. In this paper we seek to
              remedy this by proposing, EDM - Evidence-based Data
              Mining - a general framework for Data Mining based on
              Evidence Theory. Having a general framework for Data
              Mining offers a number of advantages. It provides a
              common method for representing knowledge which allows
              prior knowledge from the user or knowledge discovered
              by another discovery process to be incorporated into
              the discovery process. A common knowledge
              representation also supports the discovery of
              meta-knowledge from knowledge discovered by different Data
              Mining techniques. Furthermore, a general framework can
              provide facilities that are common to most discovery
              processes, e.g. incorporating domain knowledge and
              dealing with missing values. The framework presented in
              this paper has the following additional advantages. The
              framework is inherently parallel. Thus, algorithms
              developed within this framework will also be parallel
              and will therefore be expected to be efficient for
              large data sets - a necessity as most commercial data
              sets, relational or otherwise, are very large. This is
              compounded by the fact that the algorithms are complex.
              Also, the parallelism within the framework allows its
              use in parallel, distributed and heterogeneous
              databases. The framework is easily updated and new
              discovery methods can be readily incorporated within
              the framework, making it 'general' in the functional
              sense in addition to the representational sense
              considered above. The framework provides an intuitive
              way of dealing with missing data during the discovery
              process using the concept of Ignorance borrowed from
              Evidence Theory. The framework consists of a method for
              representing data and knowledge, and methods for data
              manipulation or knowledge discovery(1). We suggest an
              extension of the conventional definition of mass
              functions in Evidence Theory for use in Data Mining, as
              a means to represent evidence of the existence of rules
              in the database. The discovery process within EDM
              consists of a series of operations on the mass
              functions. Each operation is carried out by an EDM
              operator. We provide a classification for the EDM
              operators based on the discovery functions performed by
              them and discuss aspects of the induction, domain and
              combination operator classes. The application of EDM to
              two separate Data Mining tasks is also addressed,
              highlighting the advantages of using a general
              framework for Data Mining in general and, in
              particular, using one that is based on Evidence
              Theory.},
  keywords = {DATA MINING, KNOWLEDGE DISCOVERY IN DATABASES,
              UNCERTAINTY HANDLING, EVIDENCE THEORY, PARALLEL
              DISCOVERY},
}
A High-Performance Data Mining Server, S. S. Anand and D. A. Bell and J. G. Hughes and C. M. Shapcott
@article{anand.ea:high-performance-server:96,
  author  = {Anand, S. S. and Bell, D. A. and Hughes, J. G. and
             Shapcott, C. M.},
  title   = {A High-Performance Data Mining Server},
  journal = {Lecture Notes in Computer Science},
  volume  = {1067},
  pages   = {907--??},
  year    = {1996},
  issn    = {0302-9743},
}
Data mining in parallel, S. S. Anand and C. Shapcott and D. Bell and J. Hughes
@inproceedings{anand.ea:parallel:95,
  author    = {Anand, S. S. and Shapcott, C. and Bell, D. and
               Hughes, J.},
  title     = {Data mining in parallel},
  booktitle = {Proceedings of WoTUG-18: Transputer and occam
               Developments},
  series    = {Transputer and Occam Engineering},
  volume    = {44},
  pages     = {113--124},
  publisher = {IOS Press},
  address   = {Amsterdam},
  month     = apr,
  year      = {1995},
  isbn      = {90-5199-222-X},
}
Getting to grips with arrears: `data mining' systems at the Leeds, anonymous
@article{anonymous:getting-to:94,
  author   = {anonymous},
  title    = {Getting to grips with arrears: `data mining' systems
              at the {Leeds}},
  journal  = {Expert Systems},
  volume   = {11},
  number   = {2},
  pages    = {122--124},
  month    = may,
  year     = {1994},
  keywords = {Applications, Data mining, kdd, Attar Software, Xpert
              Rule Analyser},
}
Data Mining: Intelligent Technology Gets down to Business, anonymous
@article{anonymous:intelligent-technology:93,
  author  = {anonymous},
  title   = {Data Mining: Intelligent Technology Gets down to
             Business},
  journal = {PC AI},
  month   = nov # "--" # dec,
  year    = {1993},
}
Lessons in Data Mining, Anonymous
@article{anonymous:lessons:97,
  author  = {Anonymous},
  title   = {Lessons in Data Mining},
  journal = {Byte Magazine},
  volume  = {22},
  number  = {2},
  pages   = {40--??},
  month   = feb,
  year    = {1997},
  issn    = {0360-5280},
}
SIGMOD '93. 1993 ACM SIGMOD. International Conference on Management of Data, Anonymous (Ed)
@proceedings{anonymous:sigmod-93:93,
  editor         = {Anonymous},
  title          = {{SIGMOD} '93. 1993 {ACM} {SIGMOD}. International
                    Conference on Management of Data},
  booktitle      = {SIGMOD '93. 1993 ACM SIGMOD. International Conference
                    on Management of Data},
  series         = {SIGMOD Record (ACM Special Interest Group on
                    Management of Data)},
  volume         = {22(2)},
  publisher      = {ACM Press},
  address        = {New York, NY 10036, USA},
  month          = jun,
  year           = {1993},
  issn           = {0163-5808},
  classification = {C6160 (Database management systems (DBMS)); C4250
                    (Database theory); C7250 (Information storage and
                    retrieval); C6170 (Expert systems); C6120 (File
                    organisation); C6140D (High level languages); C6130
                    (Data handling techniques); C6150G (Diagnostic,
                    testing, debugging and evaluating systems)},
  confdate       = {26--28 May 1993},
  conflocation   = {Washington, DC, USA},
  confsponsor    = {ACM},
  keywords       = {Benchmark programs; Database rules; Integrity; Join
                    processing; Object-oriented databases; Memory-based
                    implementations; DBMS implementation issues; Recovery;
                    Knowledge discovery; Temporal reasoning; Data
                    compression; Query optimisation; Secondary storage
                    techniques; Search structures; Query languages;
                    Interfaces; Intelligent/deductive DBMSs;
                    Relational/parallel DBMS processing; Transaction
                    management; Object/scientific DBMSs; Interoperability},
  thesaurus      = {Data compression; Database management systems;
                    Database theory; Inference mechanisms; Knowledge based
                    systems; Program testing; Query languages; Query
                    processing; Storage management; System recovery;
                    Transaction processing},
}
Supercomputers Knock At IS Doors, Anonymous
% Trade-magazine article; the end page is unknown in the source record (79--??).
% Only the acronym IS is brace-protected so that styles can still recase the title.
@Article{anonymous:supercomputers-knock-at-is-doors:92,
  author = {Anonymous},
  title = {Supercomputers Knock At {IS} Doors},
  journal = {Datamation},
  year = {1992},
  month = dec,
  day = {01},
  volume = {38},
  number = {24},
  pages = {79--??},
  ISSN = {0011-6963},
  abstract = {Cost-effective massively parallel designs gain
    converts for data mining and OLTP applications among
    leading edge users and traditional systems suppliers.},
}
Computational learning theory: an introduction, Martin Anthony and Norman Biggs
% Monograph, volume 30 of the Cambridge Tracts in Theoretical Computer Science.
@Book{anthony.ea:computational-learning:92,
  author = {Martin Anthony and Norman Biggs},
  title = {Computational learning theory: an introduction},
  series = {Cambridge Tracts in Theoretical Computer Science},
  volume = {30},
  publisher = {Cambridge University Press},
  year = {1992},
}
Knowledge Mining by Imprecise Querying: A Classification-based System, T. M. Anwar and H. W. Beck and S. B. Navathe
% Conference paper; the address field records the conference venue.
@InProceedings{anwar.ea:by-imprecise:92,
  author = {T. M. Anwar and H. W. Beck and S. B. Navathe},
  title = {Knowledge Mining by Imprecise Querying: {A}
    Classification-based System},
  booktitle = {Proceedings of the International Conference on Data
    Engineering},
  pages = {622--630},
  address = {Tempe, AZ},
  year = {1992},
  month = feb,
  abstract = {Knowledge mining is the process of discovering new
    knowledge that is hitherto unknown. Users with a lack
    of knowledge of database schemas engage in the process
    of knowledge mining by posing imprecise queries. An
    approach to knowledge mining by imprecise querying is
    presented that utilizes conceptual clustering
    techniques. In contrast to numeric or fuzzy set
    approaches which ultimately rely on some distance
    metric and threshold to processing such queries,
    conceptual clustering retrieves instances which are
    structurally, semantically, and pragmatically similar
    to the query even though they may not match the
    requirements exactly. The query processor has both a
    deductive and inductive component. The deductive
    component finds precise matches in the traditional
    sense, and the inductive component identifies ways in
    which imprecise matches may be considered similar.
    Ranking on similarity is done using the database
    taxonomy, by which similar instances become members of
    the same class. Relative similarity is determined by
    depth in the taxonomy. The conceptual clustering
    algorithm, its use in query processing and an example
    are presented.},
}
Sales surge as mainframes find a role in client\slash server, E. L. Appleton
% Single-page trade-magazine article with indexing metadata
% (classification, keywords, thesaurus) carried in non-standard fields.
@Article{appleton:sales-surge:95,
  author = {E. L. Appleton},
  title = {Sales surge as mainframes find a role in client\slash
    server},
  journal = {Datamation},
  year = {1995},
  month = jun,
  volume = {41},
  number = {10},
  pages = {48},
  ISSN = {0011-6963},
  language = {English},
  pubcountry = {USA},
  classification = {D5010 (Computers and work stations); D5020 (Computer
    networks and intercomputer communications)},
  keywords = {Mainframes; Client/server; Demand; Economy;
    Large-system market; Vendors; IBM Parallel Sysplex;
    UNIX server; NT server; Pyramid; HP T-500; Data mining;
    Parallelism; IBM Power Parallel; Amdahl ECL mainframe},
  thesaurus = {Client-server systems; DP industry; Mainframes},
}
Predicting defects in Disk Drive Manufacturing: a case study in High-Dimensional Classification, Chidanand Apt\'e and Sholom Weiss and Gordon Grout
% Conference paper. The accent is written as a BibTeX special character
% ({\'e}) so that sorting and label generation treat it as a single letter.
@InProceedings{apte.ea:predicting-defects:93,
  author = {Chidanand Apt{\'e} and Sholom Weiss and Gordon Grout},
  title = {Predicting defects in Disk Drive Manufacturing: a case
    study in High-Dimensional Classification},
  booktitle = {Proceedings of the 9th Conference on Artificial
    Intelligence for Applications},
  pages = {212--218},
  address = {Orlando, Florida},
  year = {1993},
}
A Linear Method for Deviation Detection in Large Databases, Andreas Arning and Rakesh Agrawal and Prabhakar Raghavan
% Conference paper; venue details are inherited from the cross-referenced
% proceedings entry, which is defined elsewhere in the file.
@InProceedings{arning.ea:linear-method:96,
  author = {Andreas Arning and Rakesh Agrawal and Prabhakar
    Raghavan},
  title = {A Linear Method for Deviation Detection in Large
    Databases},
  pages = {164},
  crossref = {simoudis.ea:proceedings-second:96},
}
Exploiting Background Knowledge in Automated Discovery, John M. Aronis and Foster J. Provost and Bruce G. Buchanan
% Conference paper; venue details are inherited from the cross-referenced
% proceedings entry, which is defined elsewhere in the file.
@InProceedings{aronis.ea:exploiting-background:96,
  author = {John M. Aronis and Foster J. Provost and Bruce G.
    Buchanan},
  title = {Exploiting Background Knowledge in Automated
    Discovery},
  pages = {355},
  crossref = {simoudis.ea:proceedings-second:96},
}
Increasing the Efficiency of Data Mining Algorithms with Breadth-First Marker Propagation, John M. Aronis and Foster J. Provost
% Conference paper; venue details are inherited from the cross-referenced
% proceedings entry, which is defined elsewhere in the file.
@InProceedings{aronis.ea:increasing-efficiency:97,
  author = {John M. Aronis and Foster J. Provost},
  title = {Increasing the Efficiency of Data Mining Algorithms
    with Breadth-First Marker Propagation},
  pages = {119},
  crossref = {heckerman.ea:proceedings-third:97},
}
Data mining for lead identification and explosion, S. Ash and S. Gothe
% Meeting abstract. The issue designation is stored in `number` (classic
% styles ignore `issue`) and the author affiliation in `affiliation`.
% NOTE(review): pages 57--CINF looks like an ACS abstract identifier
% (57-CINF) rather than a page range -- confirm against the source.
@Article{ash.ea:lead-identification:97,
  author = {S. Ash and S. Gothe},
  title = {Data mining for lead identification and explosion},
  journal = {Abstracts of Papers of the American Chemical Society},
  year = {1997},
  volume = {213},
  number = {Pt1},
  pages = {57--CINF},
  affiliation = {Tripos Inc, St Louis, Mo, 63144},
}
Managing Complexity in Large Data Bases Using Self-Organizing Maps, Barbro Back and Mikko Irjala and Kaisa Sere and Hannu Vanharanta
Available as
hypertext.
% TUCS technical report. Words that were split or garbled during text
% extraction in the abstract have been repaired.
@TechReport{back.ea:managing-complexity:96,
  author = {Barbro Back and Mikko Irjala and Kaisa Sere and Hannu
    Vanharanta},
  title = {Managing Complexity in Large Data Bases Using
    Self-Organizing Maps},
  institution = {TUCS - Turku Centre for Computer Science},
  number = {TUCS-TR-48},
  month = oct # " 23",
  year = {1996},
  keywords = {neural networks, self-organizing maps, data bases,
    benchmarking},
  URL = {http://www.tucs.abo.fi/publications/techreports/TR48.html},
  abstract = {The amount of financial information in today's
    sophisticated large data bases is huge and makes
    comparisons between company performance - especially
    over time - difficult or at least very time consuming.
    The aim of this paper is to investigate whether neural
    networks in the form of self-organizing maps can be
    used to manage the complexity in large data bases. We
    structure and analyze accounting numbers in a large
    data base over several time periods. By using self
    organizing maps, we overcome the problems associated
    with finding the appropriate underlying distribution
    and the functional form of the underlying data in the
    structuring task that is often encountered, for
    example, when using cluster analysis. The method chosen
    also offers a way of visualizing the results. The data
    base in this study consists of annual reports of more
    than 120 world wide forest companies with data from a
    five year time period. This paper is an extended
    version of our paper Data Mining Accounting Numbers Using
    Self Organising Maps presented at Finnish Artificial
    Intelligence Conference in Vasa 20-23 August 1996.},
}
ReDuce: Automatic Structuring and Compression in Relational Databases, B. Bain and C. Sammut and A. Sharma and J. Shepherd
% Workshop paper. The system name ReDuce is brace-protected as a whole word
% so that its mixed case survives style recasing.
@InProceedings{bain.ea:reduce-automatic:96,
  author = {B. Bain and C. Sammut and A. Sharma and J. Shepherd},
  title = {{ReDuce}: {A}utomatic Structuring and Compression in
    Relational Databases},
  booktitle = {Proceedings of the MLnet Familiarization Workshop on
    Data Mining with Inductive Logic Programming},
  pages = {41--52},
  year = {1996},
}
Knowledge from data using fuzzy methods, J. F. Baldwin
% Journal article. The issue is stored in `number` (classic styles ignore
% `issue`) and the author affiliation in `affiliation`.
@Article{baldwin:using-fuzzy:96,
  author = {J. F. Baldwin},
  title = {Knowledge from data using fuzzy methods},
  journal = {Pattern Recognition Letters},
  year = {1996},
  volume = {17},
  number = {6},
  pages = {593--600},
  affiliation = {Univ Bristol, Dept Engn Math, Bristol, Avon, England},
  abstract = {The basic concept of a data browser is explained and
    some methods are described which are suitable for
    extracting knowledge from data as an induction process.
    The data browser gives data mining capabilities but
    also provides a stage for computers and users to act
    out their parts in this knowledge discovery process.},
}
From molecules to models to data mining, N. Basta
% Single-page journal article. The issue is stored in `number` and the
% affiliation in `affiliation`.
% NOTE(review): a US Dept of Defense affiliation looks unrelated to a
% Chemical Engineering article -- confirm against the source record.
@Article{basta:molecules-to:96,
  author = {N. Basta},
  title = {From molecules to models to data mining},
  journal = {Chemical Engineering},
  year = {1996},
  volume = {103},
  number = {2},
  pages = {5},
  affiliation = {Us Dept Def, Off Infosec Comp Sci, Ft George G Meade,
    Md, 20755},
}
Brute-Force Mining of High-Confidence Classification Rules, Roberto J. Bayardo, Jr.
% Conference paper; venue details are inherited from the cross-referenced
% proceedings entry. The author uses the "Last, Jr., First" form so that
% BibTeX parses the suffix as a suffix and not as a given name.
@InProceedings{bayardo:brute-force-high-confidence:97,
  author = {Bayardo, Jr., Roberto J.},
  title = {Brute-Force Mining of High-Confidence Classification
    Rules},
  pages = {123},
  crossref = {heckerman.ea:proceedings-third:97},
}
Discovery and Maintenance of Functional Dependencies by Independencies, S. Bell
% Workshop paper published by AAAI Press.
@InProceedings{bell:maintenance-functional:95,
  author = {S. Bell},
  title = {Discovery and Maintenance of Functional Dependencies
    by Independencies},
  booktitle = {Proceedings of the Workshop on Knowledge Discovery in
    Databases},
  publisher = {AAAI Press},
  pages = {27--32},
  year = {1995},
}
From data properties to evidence, D. A. Bell
% Journal article. The journal name is spelled out in full, the issue is
% stored in `number` (classic styles ignore `issue`) and the author
% affiliation in `affiliation`.
@Article{bell:properties-to:93,
  author = {D. A. Bell},
  title = {From data properties to evidence},
  journal = {IEEE Transactions on Knowledge and Data Engineering},
  year = {1993},
  volume = {5},
  number = {6},
  pages = {965--969},
  affiliation = {Univ Ulster, Dept Informat Sci, Jordanstown Bt37 0Qb,
    Antrim, North Ireland},
  abstract = {Information and knowledge in computerized information
    systems are often characterized by uncertainty. The
    facts needed for some realistic applications are
    unavailable or are crudely estimated or judged. This
    problem manifests itself frequently in information
    systems centered on databases. We describe here an
    exploration of an aspect of the problem of handling
    uncertain evidence on which reasoning is to be based.
    We focus upon the problem of making decisions among
    propositions based on both uncertain data items (in
    contrast to data in conventional databases) and
    arguments which are not certain. The primary knowledge
    discovery issue we address is a classification problem
    - which classification does the available evidence
    support? The method investigated here seeks to exploit
    information available from conventional database
    systems - namely, the integrity assertions or data
    dependency information contained in the database. This
    information, e.g., from functional dependencies and a
    form of multivalued dependencies, allows us to rank
    arguments in terms of their strengths. Hence, as a step
    in the process of discovering classification knowledge,
    using a database as a secondary knowledge discovery
    exercise, we explicate latent knowledge pertinent to
    arguments of relevance to the purpose at hand. This is
    called evidence. Information is requested via user
    prompts from an evidential reasoner. It is fed as
    evidence to the reasoner. An object-oriented structure
    for managing evidence is used to model the conclusion
    space and to reflect the evidence structure. The
    implementation of the evidence structure and an example
    of its use are outlined.},
  keywords = {CLASSIFICATION, DATA DEPENDENCIES, DATABASE, EVIDENCE
    BASE, EVIDENTIAL REASONING, INTEGRITY CONSTRAINTS},
}
Value-added databases: knowledge discovery and evidential reasoning, D. Bell
% Workshop paper. The title carries no trailing period: bibliography styles
% add their own punctuation, so a stored period would be doubled.
@InProceedings{bell:value-added-evidential:94,
  author = {D. Bell},
  title = {Value-added databases: knowledge discovery and
    evidential reasoning},
  booktitle = {Proceedings of the International Workshop on Advances
    in Databases and Information Systems - {ADBIS'94}},
  address = {Moscow},
  year = {1994},
  month = may # " 23--26",
  pages = {2--9},
  abstract = {Results of research into methods of managing evidence
    can be coupled with the power and capacity of data
    management systems to give a potent approach to
    discovering interesting but hidden patterns in large
    collections of data. We present some pertinent results
    from evidence theory and its applications, and suggest
    an approach to the exploitation of these results in the
    discovery of knowledge which is held in databases. In
    this sense we {\em add value} to databases, which
    presumably already justify their existence, and hence
    further increase the attractiveness of very large
    database systems.},
}
An Examination of Inductive Learning Algorithms for the Classification of Sleep Signals, John A. Bentrup and Sylvian R. Ray
Available as
compressed postscript.
% University technical report. The `type` label carries no trailing period;
% the style supplies the punctuation between the label and the report number.
@TechReport{bentrup.ea:examination-inductive:93,
  author = {John A. Bentrup and Sylvian R. Ray},
  title = {An Examination of Inductive Learning Algorithms for
    the Classification of Sleep Signals},
  institution = {Department of Computer Science, University of Illinois
    at Urbana-Champaign},
  type = {Report},
  number = {UIUCDCS-R-93-1792},
  address = {1304 Springfield Avenue, Urbana, Il 61801},
  month = feb,
  year = {1993},
  URL = {ftp://a.cs.uiuc.edu/pub/TechReports/UIUCDCS-R-93-1792.ps.Z},
  note = {Modified version to appear in Proceedings of the 30th
    Annual Rocky Mountain Bioengineering Symposium (April
    1993).},
  annote = {Nine inductive learning algorithms are tested on sleep
    signals of 161 subjects. Algorithms are ID3, C4, CART,
    MDL, AIMS, Bayes, PLS(K), PRG, Nearest Neighbour and
    COBWEB. Nice table summarising algorithms.},
}
Integrated Learning in a Real Domain, F. Bergadano and A. Giordana and L. Saitta
% Chapter in an edited collection. The edition is given as an ordinal word,
% which is the form classic BibTeX styles expect.
@InCollection{bergadano.ea:integrated-learning:91,
  author = {F. Bergadano and A. Giordana and L. Saitta},
  title = {Integrated Learning in a Real Domain},
  editor = {Gregory Piatetsky-Shapiro and William J. Frawley},
  booktitle = {Knowledge Discovery in Databases},
  edition = {First},
  pages = {277--288},
  publisher = {AAAI Press / The MIT Press},
  address = {Menlo Park, California},
  year = {1991},
}
Applying Data Mining and Machine Learning Techniques to Submarine Intelligence Analysis, Ulla Bergsten and Johan Schubert and Per Svensson
% Conference paper; venue details are inherited from the cross-referenced
% proceedings entry, which is defined elsewhere in the file.
@InProceedings{bergsten.ea:applying-machine:97,
  author = {Ulla Bergsten and Johan Schubert and Per Svensson},
  title = {Applying Data Mining and Machine Learning Techniques
    to Submarine Intelligence Analysis},
  pages = {127},
  crossref = {heckerman.ea:proceedings-third:97},
}
Hot Topics: Customizing information. 2. How successful are we so far?, D. Berleant and H. Berghel
% Magazine column (part 2 of a series, per the title) with indexing metadata
% carried in non-standard fields.
@Article{berleant.ea:hot-topics:94,
  author = {D. Berleant and H. Berghel},
  title = {Hot Topics: Customizing information. 2. {How}
    successful are we so far?},
  journal = {Computer},
  year = {1994},
  month = oct,
  volume = {27},
  number = {10},
  pages = {76--78},
  ISSN = {0018-9162},
  affiliation = {Dept. of Comput. Syst. Eng., Arkansas Univ.,
    Fayetteville, AR, USA},
  classification = {C6130D (Document processing techniques); C7210
    (Information services and centres); C7250N (Front end
    systems for online searching)},
  keywords = {Advanced information customization; Browsing; Data
    interchange; Digital library; Document customization;
    Filtering; Hypermedia; Hypertext; Information analysis;
    Information extraction; Information retrieval;
    Information science; Information-customizing
    interfaces; Interactivity; Knowledge discovery;
    Nonprescriptive structuring},
  thesaurus = {Document handling; Full-text databases; Hypermedia;
    Information retrieval; Online front-ends},
}
Enactment in Information Farming, Mark Bernstein
% Conference paper from the Technical Briefings track of ACM Hypertext'93.
@InProceedings{bernstein:enactment-information:93,
  author = {Mark Bernstein},
  title = {Enactment in Information Farming},
  booktitle = {Proceedings of ACM Hypertext'93},
  series = {Technical Briefings},
  pages = {242--249},
  year = {1993},
  copyright = {(c) Copyright 1993 Association for Computing
    Machinery},
  keywords = {Design, Rhetoric, Enactment, Collaboration,
    Information farming},
  abstract = {Information farming views the cultivation of
    information as a continuing, collaborative activity
    performed by groups of people working together to
    achieve changing individual and common goals. Failure
    to differentiate information farming from related but
    distinct activities like information mining and data
    factories has been a fruitful source of
    misunderstanding and discord in the hypertext
    literature and in the design of hypertext environments.
    Dramatic enactment and visual salience -- not recall,
    precision, or usability -- assume primary roles in
    design for information gardening. In this technical
    briefing, we examine how enactment contribute to the
    success and failure of a variety of Hypergate and
    Storyspace features.},
}
Computational Methods for Intelligent Information Access, Michael W. Berry and Susan T. Dumais and Todd A. Letsche
% Conference paper. The keyword list no longer ends in a dangling comma.
% NOTE(review): the abstract field holds a media note rather than an
% abstract -- consider moving it to `note`.
@InProceedings{berry.ea:computational-methods:95,
  author = {Michael W. Berry and Susan T. Dumais and Todd A.
    Letsche},
  title = {Computational Methods for Intelligent Information
    Access},
  booktitle = {Proceedings of Supercomputing'95},
  publisher = {ACM/IEEE},
  address = {San Diego, CA},
  month = dec,
  year = {1995},
  keywords = {data mining, indexing, information, latent, matrices,
    retrieval, semantic, singular value decomposition
    (SVD), sparse, updating},
  abstract = {ps/PDF on the CD with MPEG.},
}
Testing Complex Temporal Relationships Involving Multiple Granularities and Its Application to Data Mining, C. Bettini and X. Sean Wang and S. Jajodia
% PODS 1996 conference paper. The second author is written in
% "Last, First" form so that the surname is Wang, not "Sean Wang".
@InProceedings{bettini.ea:testing-complex:96,
  author = {C. Bettini and Wang, X. Sean and S. Jajodia},
  title = {Testing Complex Temporal Relationships Involving
    Multiple Granularities and Its Application to Data
    Mining},
  editor = {{ACM}},
  booktitle = {Proceedings of the Fifteenth {ACM}
    {SIGACT}-{SIGMOD}-{SIGART} Symposium on Principles of
    Database Systems, {PODS} 1996, Montr{\'e}al, Canada,
    June 3--5, 1996},
  series = {Proceedings of the ACM SIGACT SIGMOD SIGART Symposium
    on Principles of Database Systems},
  volume = {15},
  publisher = {ACM Press},
  address = {New York, NY 10036, USA},
  year = {1996},
  pages = {68--78},
  annote = {Held in conjunction with the 1996 ACM SIGMOD
    international conference on management of data. Also
    known as PODS 1996},
  keywords = {database systems; PODS; ACM; SIGMOD; SIGART; SIGACT},
}
Time-dependent concepts: representation and reasoning using temporal description logics, C. Bettini
% Journal article. The issue is stored in `number` (classic styles ignore
% `issue`), the author affiliation in `affiliation`, and the quoted word in
% the abstract uses TeX opening quotes.
@Article{bettini:time-dependent-concepts:97,
  author = {C. Bettini},
  title = {Time-dependent concepts: representation and reasoning
    using temporal description logics},
  journal = {Data \& Knowledge Engineering},
  year = {1997},
  volume = {22},
  number = {1},
  pages = {1--38},
  affiliation = {Univ Milan, Dipartimento Sci Informaz, I-20122 Milan,
    Italy},
  abstract = {A time-dependent concept is a conceptual entity that
    is defined in terms of temporal relationships with
    other entities. For example, the concept of an action
    is defined in terms of a set of temporal relationships
    among states of a system. The concept of ``widow'', in
    natural language, is defined in terms of events that
    have occurred in the past. Time-dependent concepts
    appear in several application areas, from natural
    language to diagnosis, from planning to data mining. An
    interesting issue in knowledge representation is how to
    formally represent and reason with these concepts. In
    this paper, we represent a family of formal
    representation languages obtained as an interval-based
    temporal extension of description logics. We illustrate
    the expressiveness of these formalisms in representing
    time-dependent concepts with respect to standard
    description logics and other extensions. We give some
    complexity results for reasoning problems and we
    propose approximate algorithms to compute subsumption
    among time-dependent concepts.},
  keywords = {INTERVALS, temporal knowledge, temporal reasoning,
    description logics, taxonomies, subsumption algorithms,
    temporal objects},
}
Advanced Scout: Data Mining and Knowledge Discovery in NBA data, Inderpal Bhandari and Ed Colet and Jennifer Parker and Zachary Pines and Rajiv Pratap and Krishnakumar Ramanujam
% Journal article; the annote field holds the paper summary and no page
% range is recorded.
@Article{bhandari.ea:advanced-scout:97,
  author = {Inderpal Bhandari and Ed Colet and Jennifer Parker and
    Zachary Pines and Rajiv Pratap and Krishnakumar
    Ramanujam},
  title = {Advanced Scout: Data Mining and Knowledge Discovery in
    {NBA} data},
  journal = {Data Mining and Knowledge Discovery},
  volume = {1},
  number = {1},
  year = {1997},
  annote = {Advanced Scout is a PC-based data mining application
    used by National Basketball Association (NBA) coaching
    staffs to discover interesting patterns in basketball
    game data. We describe Advanced Scout software from the
    perspective of data mining and knowledge discovery.
    This paper highlights the pre-processing of raw data
    that the program performs, describes the data mining
    aspects of the software and how the interpretation of
    patterns supports the process of knowledge discovery.
    The underlying technique of attribute focusing as the
    basis of the algorithm is also described. The process
    of pattern interpretation is facilitated by allowing
    the user to relate patterns to video tape.},
}
A case-study of software process improvement during development, I. Bhandari and M. Halliday and E. Tarver and D. Brown and J. Chaar and R. Chillarege
% Journal article. The journal name is spelled out, the issue is stored in
% `number`, the affiliations in `affiliation`, and two extraction typos are
% fixed ("machine- assisted" and the keyword "DATE EXPLORATION").
@Article{bhandari.ea:case-study-software:93,
  author = {I. Bhandari and M. Halliday and E. Tarver and D. Brown
    and J. Chaar and R. Chillarege},
  title = {A case-study of software process improvement during
    development},
  journal = {IEEE Transactions on Software Engineering},
  year = {1993},
  volume = {19},
  number = {12},
  pages = {1157--1170},
  affiliation = {Ibm Corp, Thomas J Watson Res Ctr, Yorktown Hts, Ny,
    10598 Ibm Corp, Mid Hudson Valley Programming Lab,
    Wappingers Falls, Ny, 12590},
  abstract = {We present a case study of the use of a software
    process improvement method which is based on the
    analysis of defect data. The first step of the method
    is the classification of software defects using
    attributes which relate defects to specific process
    activities. Such classification captures the semantics
    of the defects in a fashion which is useful for process
    correction. The second step utilizes a machine-assisted
    approach to data exploration which allows a
    project team to discover such knowledge from defect
    data as is useful for process correction. We show that
    such analysis of defect data can readily lead a project
    team to improve their process during development.},
  keywords = {CYCLE, DATA EXPLORATION, DEFECT-BASED PROCESS
    IMPROVEMENT, IN-PROCESS METRICS, KNOWLEDGE DISCOVERY},
}
Attribute focusing - machine-assisted knowledge discovery applied to software production process-control, I. Bhandari
% Journal article. The issue is stored in `number` (classic styles ignore
% `issue`) and the author affiliation in `affiliation`.
@Article{bhandari:attribute-focusing:94,
  author = {I. Bhandari},
  title = {Attribute focusing - machine-assisted knowledge
    discovery applied to software production
    process-control},
  journal = {Knowledge Acquisition},
  year = {1994},
  volume = {6},
  number = {3},
  pages = {271--294},
  affiliation = {Ibm Corp, Thomas J Watson Res Ctr, Yorktown Hts, Ny,
    10598},
  abstract = {How can people who are not trained in data analysis
    discover knowledge from a database of attribute-valued
    data? I address this question by presenting a
    man-machine approach to knowledge discovery called
    Attribute Focusing and its application to software
    production process control. Attribute Focusing utilizes
    an automatic filter to focus attention on that small
    part of a large amount of data which is interesting. A
    person studies that part in a manner which leads him to
    discover knowledge about the physical situation to
    which the data pertain. Specifically, the paper
    describes: 1. A model of interestingness of data based
    on the magnitude of data values, the association of
    data values and basic knowledge of the limits of human
    processing. 2. The use of that model of interestingness
    by people to discover knowledge. 3. The application of
    the Attribute Focusing approach to diagnose and correct
    the software production process. Based on the results
    that have been observed, the paper concludes that
    man-machine approaches to knowledge discovery should be
    emphasized much more than has been in the past, and
    that Attribute Focusing is a powerful, practical
    approach to such discovery.},
}
Data mining, N. Bissantz and J. Hagedorn
% Journal article. The issue is stored in `number`.
% NOTE(review): the source record carried an IBM (Yorktown Hts / Wappingers
% Falls) affiliation that is byte-identical to the one on the Bhandari et al.
% 1993 record; it has been dropped as a copy error. Confirm the authors'
% actual affiliation before re-adding one.
@Article{bissantz.ea:data-mining:93,
  author = {N. Bissantz and J. Hagedorn},
  title = {Data mining},
  journal = {Wirtschaftsinformatik},
  year = {1993},
  volume = {35},
  number = {5},
  pages = {481--487},
}
Relational knowledge discovery in databases, H. Blockeel and L. De Raedt
% Workshop paper. The second author is given in "Last, First" form so that
% the two-word surname De Raedt is not split into given name and surname.
@InProceedings{blockeel.ea:relational:96,
  author = {H. Blockeel and De Raedt, L.},
  title = {Relational knowledge discovery in databases},
  editor = {S. Muggleton},
  booktitle = {Proceedings of the 6th International Workshop on
    Inductive Logic Programming},
  publisher = {Stockholm University, Royal Institute of Technology},
  pages = {1--13},
  year = {1996},
}
Discovery, Confirmation and Incorporation of Causal Relationships from a Large Time-Oriented Clinical Database: The RX Project, Robert L. Blum
% Journal article on the RX project; no issue number is recorded.
@Article{blum:confirmation-incorporation:82,
  author = {Robert L. Blum},
  title = {Discovery, Confirmation and Incorporation of Causal
    Relationships from a Large Time-Oriented Clinical
    Database: The {RX} Project},
  journal = {Computers and Biomedical Research},
  year = {1982},
  volume = {15},
  pages = {164--187},
}
Discovery and Representation of Causal Relationships from a Large Time-Oriented Clinical Database: The RX Project, Robert L. Blum
% Monograph, volume 19 of Lecture Notes in Medical Informatics.
@Book{blum:representation-causal:82,
  author = {Robert L. Blum},
  title = {Discovery and Representation of Causal Relationships
    from a Large Time-Oriented Clinical Database: The {RX}
    Project},
  series = {Lecture Notes in Medical Informatics},
  volume = {19},
  publisher = {Springer-Verlag},
  year = {1982},
}
Occam's Razor, Anselm Blumer and Andrzej Ehrenfeucht and David Haussler and Manfred K. Warmuth
% Journal article. The journal name is capitalised as a proper title;
% styles do not recase journal names.
@Article{blumer.ea:occams-razor:87,
  author = {Anselm Blumer and Andrzej Ehrenfeucht and David
    Haussler and Manfred K. Warmuth},
  title = {Occam's Razor},
  journal = {Information Processing Letters},
  year = {1987},
  volume = {24},
  pages = {377--380},
}
Process-Based Database Support for the Early Indicator Method, Christoph Breitner and J\"org Schl\"osser and R\"udiger Wirth
% Conference paper; venue details are inherited from the cross-referenced
% proceedings entry, which is defined elsewhere in the file.
@InProceedings{breitner.ea:process-based-database:97,
  author = {Christoph Breitner and J{\"{o}}rg Schl{\"{o}}sser and
    R{\"{u}}diger Wirth},
  title = {Process-Based Database Support for the Early Indicator
    Method},
  pages = {131},
  crossref = {heckerman.ea:proceedings-third:97},
}
SAMIA: a bottom-up learning method using a simulated annealing algorithm, Pierre Br\'ezellec and Henri Soldano
% Conference paper. The accent is written as a BibTeX special character
% ({\'e}); publisher, series and conference names are capitalised as proper
% names.
@InProceedings{brezellec.ea:samia-bottom-up:93,
  author = {Pierre Br{\'e}zellec and Henri Soldano},
  title = {{SAMIA}: a bottom-up learning method using a simulated
    annealing algorithm},
  booktitle = {Proceedings of the European Conference on Machine
    Learning},
  series = {Lecture Notes in Artificial Intelligence},
  pages = {297--309},
  publisher = {Springer-Verlag},
  year = {1993},
}
Direct Access of an ILP Algorithm to a Database Management System, P. Brockhausen and K. Morik
% Workshop paper; the workshop name is corrected to "Programming".
@InProceedings{brockhausen.ea:direct-access:96,
  author = {P. Brockhausen and K. Morik},
  title = {Direct Access of an {ILP} Algorithm to a Database
    Management System},
  booktitle = {Proceedings of the MLnet Familiarization Workshop on
    Data Mining with Inductive Logic Programming},
  pages = {95--110},
  year = {1996},
}
Applying classification algorithms in practice (preprint), C. E. Brodley and P. Smyth
Available as
hypertext.
% Preprint of a journal article. The publication status lives in `note`
% so that `journal` holds only the journal name.
% NOTE(review): no year is recorded (a required field for this entry type);
% add it once the published version is confirmed.
@Article{brodley.ea:applying-classification:,
  author = {C. E. Brodley and P. Smyth},
  title = {Applying classification algorithms in practice
    (preprint)},
  journal = {Statistics and Computing},
  note = {To appear},
  URL = {http://yake.ecn.purdue.edu/~brodley/my-papers/publications.html},
}
Distributed Information Management in the National HPCC Software Exchange, Shirley Browne and Jack Dongarra and Geoffrey C. Fox and Ken Hawick and Ken Kennedy and Rick Stevens and Robert Olson and Tom Rowan
% Conference paper. The keyword list no longer ends in a dangling comma.
% NOTE(review): the abstract field holds a media note rather than an
% abstract -- consider moving it to `note`.
@InProceedings{browne.ea:distributed-information:95,
  author = {Shirley Browne and Jack Dongarra and Geoffrey C. Fox
    and Ken Hawick and Ken Kennedy and Rick Stevens and
    Robert Olson and Tom Rowan},
  title = {Distributed Information Management in the National
    {HPCC} Software Exchange},
  booktitle = {Proceedings of Supercomputing'95},
  publisher = {ACM/IEEE},
  address = {San Diego, CA},
  month = dec,
  year = {1995},
  keywords = {data mining, information management, information
    retrieval, HPCC, high performance computing, software
    repository},
  abstract = {Simple html document on CD.},
}
MineSet: An Integrated System for Data Mining, Cliff Brunk and James Kelly and Ron Kohavi
% Conference paper; venue details are inherited from the cross-referenced
% proceedings entry. The product name MineSet is brace-protected so that
% sentence-casing styles keep its internal capital.
@InProceedings{brunk.ea:mineset-integrated:97,
  author = {Cliff Brunk and James Kelly and Ron Kohavi},
  title = {{MineSet}: An Integrated System for Data Mining},
  pages = {135},
  crossref = {heckerman.ea:proceedings-third:97},
}
A guide to the literature on learning probabilistic networks from data, W. Buntine
% Literature review article. The journal name is spelled out, the issue is
% stored in `number` (classic styles ignore `issue`) and the author
% affiliation in `affiliation`.
@Article{buntine:guide-to:96,
  author = {W. Buntine},
  title = {A guide to the literature on learning probabilistic
    networks from data},
  journal = {IEEE Transactions on Knowledge and Data Engineering},
  year = {1996},
  volume = {8},
  number = {2},
  pages = {195--210},
  affiliation = {Thinkbank, 1678 Shattuck Ave, Suite 320, Berkeley, Ca,
    94709},
  abstract = {This literature review discusses different methods
    under the general rubric of learning Bayesian networks
    from data, and includes some overlapping work on more
    general probabilistic networks. Connections are drawn
    between the statistical, neural network, and
    uncertainty communities, and between the different
    methodological communities, such as Bayesian,
    description length, and classical statistics. Basic
    concepts for learning and Bayesian networks are
    introduced and methods are then reviewed. Methods are
    discussed for learning parameters of a probabilistic
    network, for learning the structure, and for learning
    hidden variables. The presentation avoids formal
    definitions and theorems, as these are plentiful in the
    literature, and instead illustrates key concepts with
    simplified examples.},
  keywords = {EXPERT-SYSTEMS, BAYESIAN NETWORKS, GRAPHICAL MODELS,
    INDEPENDENCE, COMPLEXITY, BAYESIAN NETWORKS, GRAPHICAL
    MODELS, HIDDEN VARIABLES, LEARNING, LEARNING STRUCTURE,
    PROBABILISTIC NETWORKS, KNOWLEDGE DISCOVERY},
}
Attribute-Oriented Induction in Relational Databases, Yandong Cai and Nick Cercone and Jiawei Han
% Chapter in an edited collection. The third author's given name is
% corrected to Jiawei, and the edition is given as an ordinal word, which is
% the form classic BibTeX styles expect.
@InCollection{cai.ea:attribute-oriented-induction:91,
  author = {Yandong Cai and Nick Cercone and Jiawei Han},
  title = {Attribute-Oriented Induction in Relational Databases},
  editor = {Gregory Piatetsky-Shapiro and William J. Frawley},
  booktitle = {Knowledge Discovery in Databases},
  edition = {First},
  pages = {213--228},
  publisher = {AAAI Press / The MIT Press},
  address = {Menlo Park, California},
  year = {1991},
}
An overview of machine learning, Jaime G. Carbonell and Ryszard S. Michalski and Tom M. Mitchell
% Book chapter; book details are inherited from the cross-referenced
% collection entry, which is defined elsewhere in the file.
@InCollection{carbonell.ea:overview-machine:83,
  author = {Jaime G. Carbonell and Ryszard S. Michalski and Tom M.
    Mitchell},
  title = {An overview of machine learning},
  pages = {3--24},
  crossref = {michalski.ea:machine-learning:83},
}
Assessing Credit Card Applications Using Machine Learning, Chris Carter and Jason Catlett
% Magazine article. The source record stored the cover date "Fall 1987" in
% `volume`; that text is kept in `note` and the numeric volume and issue are
% given instead.
% NOTE(review): volume 2, number 3 is the Fall 1987 issue of IEEE Expert as
% far as I can tell -- confirm against the publisher's index.
@Article{carter.ea:assessing-credit:87,
  author = {Chris Carter and Jason Catlett},
  title = {Assessing Credit Card Applications Using Machine
    Learning},
  journal = {IEEE Expert},
  year = {1987},
  volume = {2},
  number = {3},
  pages = {71--79},
  note = {Fall 1987},
}
A fast, online generalization algorithm for knowledge discovery, C. L. Carter and H. J. Hamilton
% Journal article. The issue is stored in `number` (classic styles ignore
% `issue`) and the author affiliation in `affiliation`.
@Article{carter.ea:fast-online:95,
  author = {C. L. Carter and H. J. Hamilton},
  title = {A fast, online generalization algorithm for knowledge
    discovery},
  journal = {Applied Mathematics Letters},
  year = {1995},
  volume = {8},
  number = {2},
  pages = {5--11},
  affiliation = {Univ Regina, Dept Comp Sci, Regina, Sk S4S 0A2,
    Canada},
  abstract = {We present an O(n) algorithm for generalizing a
    database relation using concept hierarchies, where n is
    the number of tuples in the input relation. The
    algorithm is based on a variant of Han et al.'s
    attribute-oriented O(n log n) algorithm. Our algorithm
    is an on-line algorithm; fast performance is achieved
    because after encountering a tuple and generalizing it,
    the location of the appropriate counter to increment is
    calculated instead of searched for.},
  keywords = {KNOWLEDGE DISCOVERY, DATA MINING, DATABASES, CONCEPT
    HIERARCHIES, GENERALIZATION},
}
Megainduction: machine learning on very large databases, Jason Catlett
Available as
hypertext.
@PhdThesis{catlett:megainduction-machine:91,
author = "Jason Catlett",
title = "Megainduction: machine learning on very large
databases",
school = "Basser Department of Computer Science, University of
Sydney",
year = "1991",
URL = "http://www.research.att.com/orgs/ssr/people/catlett/phd.html",
}
IEEE Transactions on Knowledge and Data Engineering Special issue on Learning and Discovery in Databases, N. Cercone and M. Tsuchiya (guest editors) (Eds)
@Article{cercone.ea:ieee-transactions:93,
key = "cercone.ea:ieee-transactions:93",
title = "{IEEE} Transactions on Knowledge and Data Engineering
Special issue on Learning and Discovery in Databases",
journal = "IEEE Transactions on Knowledge and Data Engineering",
year = "1993",
volume = "5",
number = "6",
month = dec,
editor = "N. Cercone and M. Tsuchiya",
note = "Special issue on Learning and Discovery in Databases.
Guest editors: N. Cercone and M. Tsuchiya",
}
Proposal and Empirical Comparison of a Parallelizable Distance-Based Discretization Method, Jes\'us Cerquides and Ramon L\'opez de M\`antaras
@InProceedings{cerquides.ea:proposal-empirical:97,
title = "Proposal and Empirical Comparison of a Parallelizable
Distance-Based Discretization Method",
author = "Jes{\'u}s Cerquides and {L{\'o}pez de M{\`a}ntaras},
Ramon",
pages = "139",
crossref = "heckerman.ea:proceedings-third:97",
}
Experiments in Multistrategy Learning by Meta-Learning, Philip K. Chan and Salvatore J. Stolfo
@InProceedings{chan.ea:experiments-multistrategy:93,
  author    = {Chan, Philip K. and Stolfo, Salvatore J.},
  title     = {Experiments in Multistrategy Learning by Meta-Learning},
  booktitle = {Proceedings of the second international conference on
               information and knowledge management},
  address   = {Washington, DC},
  year      = {1993},
  pages     = {314--323},
}
Sharing Learned Models among Remote Database Partitions by Local Meta-Learning, Philip K. Chan and Salvatore J. Stolfo
@InProceedings{chan.ea:sharing-learned:96,
  author   = {Chan, Philip K. and Stolfo, Salvatore J.},
  title    = {Sharing Learned Models among Remote Database
              Partitions by Local Meta-Learning},
  pages    = {2},
  crossref = {simoudis.ea:proceedings-second:96},
}
Model uncertainty, data mining and statistical-inference, C. Chatfield
@Article{chatfield:model-uncertainty:95,
author = "C. Chatfield",
address = "Univ Bath, Sch Math Sci, Bath Ba2 7Ay, Avon, England",
title = "Model uncertainty, data mining and statistical
inference",
journal = "Journal of the Royal Statistical Society. Series A
(Statistics in Society)",
year = "1995",
volume = "158",
number = "3",
pages = "419--466",
abstract = "This paper takes a broad, pragmatic view of statistical
inference to include all aspects of model formulation.
The estimation of model parameters traditionally
assumes that a model has a prespecified known form and
takes no account of possible uncertainty regarding the
model structure. This implicitly assumes the existence
of a 'true' model, which many would regard as a
fiction. In practice model uncertainty is a fact of
life and likely to be more serious than other sources
of uncertainty which have received far more attention
from statisticians. This is true whether the model is
specified on subject-matter grounds or, as is
increasingly the case, when a model is formulated,
fitted and checked on the same data set in an
iterative, interactive way. Modern computing power
allows a large number of models to be considered and
data-dependent specification searches have become the
norm in many areas of statistics. The term data mining
may be used in this context when the analyst goes to
great lengths to obtain a good fit. This paper reviews
the effects of model uncertainty, such as too narrow
prediction intervals, and the non-trivial biases in
parameter estimates which can follow data-based
modelling. Ways of assessing and overcoming the effects
of model uncertainty are discussed, including the use
of simulation and resampling methods, a Bayesian model
averaging approach and collecting additional data
wherever possible. Perhaps the main aim of the paper is
to ensure that statisticians are aware of the problems
and start addressing the issues even if there is no
simple, general theoretical fix.",
keywords = "MOVING AVERAGE MODELS, BOOTSTRAP, VALIDATION,
PREDICTION, COMPLEXITY, SELECTION, CHOICE,
AUTOREGRESSIVE MODEL, BAYESIAN MODEL AVERAGING, DATA
MINING, FORECASTING, MODEL BUILDING, RESAMPLING,
STATISTICAL INFERENCE, SUBSET SELECTION",
}
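Large Scale Data Mining: Challenges and Responses, Jaturon Chattratichat and John Darlington and Moustafa Ghanem and Harald H\"uning and Yike Guo and Martin K\"ohler and Janjao Sutiwaraphun and Hing Wing To and Dan Yang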
@InProceedings{chattratichat.ea:large-scale:97,
title = "Large Scale Data Mining: Challenges and Responses",
author = "Jaturon Chattratichat and John Darlington and Moustafa
Ghanem and Harald H{\"u}ning and Yike Guo and Martin
K{\"o}hler and Janjao Sutiwaraphun and Hing Wing To
and Dan Yang",
pages = "143",
crossref = "heckerman.ea:proceedings-third:97",
}
Bayesian Classification (AUTOCLASS): Theory and Results, P. Cheeseman and J. Stutz
@InCollection{cheeseman.ea:bayesian-classification:95,
author = "P. Cheeseman and J. Stutz",
title = "Bayesian Classification ({AUTOCLASS}): Theory and
Results",
booktitle = "Advances in Knowledge Discovery and Data Mining",
editor = "U. M. Fayyad and G. Piatetsky-Shapiro and P. Smyth and
R. Uthurusamy",
publisher = "AAAI Press / The MIT Press",
address = "Menlo Park, California",
pages = "153--180",
year = "1995",
}
Efficient Data Mining for Path Traversal Patterns in Distributed Systems, M. S. Chen and J. S. Park and P. S. Yu
@InProceedings{chen.ea:efficient-path:96,
author = "M. S. Chen and J. S. Park and P. S. Yu",
title = "Efficient Data Mining for Path Traversal Patterns in
Distributed Systems",
booktitle = "16th International Conference on Distributed Computing
Systems ({ICDCS}'96)",
pages = "385--392",
publisher = "IEEE",
address = "Hong Kong",
month = may,
year = "1996",
keywords = "Distributed Objects",
note = "IBM T. J. Watson Research Center, USA",
}
Data mining: an overview from a database perspective, Ming-Syan Chen and Jiawei Han and Philip S. Yu
@Article{chen.ea:overview-database:96,
author = "Ming-Syan Chen and Jiawei Han and Philip S. Yu",
address = "Natl Taiwan Univ, Dept Elect Engn, Taipei 10764,
Taiwan Simon Fraser Univ, Sch Comp Sci, Burnaby, Bc V5A
1S6, Canada Ibm Corp, Thomas J Watson Res Ctr, Yorktown
Hts, Ny, 10598",
title = "Data mining: an overview from a database perspective",
journal = "IEEE Transactions on Knowledge and Data Engineering",
year = "1996",
month = dec,
volume = "8",
number = "6",
pages = "866--883",
abstract = "Mining information and knowledge from large databases
has been recognized by many researchers as a key
research topic in database systems and machine
learning, and by many industrial companies as an
important area with an opportunity of major revenues.
Researchers in many different fields have shown great
interest in data mining. Several emerging applications
in information providing services, such as data
warehousing and on-line services over the Internet,
also call for various data mining techniques to better
understand user behavior, to improve the service
provided, and to increase the business opportunities.
In response to such a demand, this article is to
provide a survey, from a database researcher's point of
view, on the data mining techniques developed recently.
A classification of the available data mining
techniques is provided, and a comparative study of such
techniques is presented.",
keywords = "data mining, knowledge discovery, association rules,
classification, data clustering, pattern matching
algorithms, data generalization and characterization,
data cubes, multiple-dimensional databases",
}
A parallel computing approach to creating engineering concept spaces for semantic retrieval - the illinois digital library initiative project, H. C. Chen and B. Schatz and T. Ng and J. Martinez and A. Kirchhoff and C. T. Lin
@Article{chen.ea:parallel-computing:96,
author = "H. C. Chen and B. Schatz and T. Ng and J. Martinez and
A. Kirchhoff and C. T. Lin",
address = "Univ Arizona, Karl Eller Grad Sch Management, Mis
Dept, Mcclelland Hall, Tucson, Az, 85721 Univ Illinois,
Natl Ctr Supercomp Applicat, Beckman Inst, Urbana, Il,
61801 Univ Arizona, Sci \& Engn Lib, Tucson, Az, 85712
Univ Arizona, Dept Lib \& Informat Studies, Tucson, Az,
85712",
title = "A parallel computing approach to creating engineering
concept spaces for semantic retrieval: the {Illinois}
{Digital} {Library} {Initiative} project",
journal = "IEEE Transactions on Pattern Analysis and Machine
Intelligence",
year = "1996",
volume = "18",
number = "8",
pages = "771--782",
abstract = "This research presents preliminary results generated
from the semantic retrieval research component of the
Illinois Digital Library Initiative (DLI) project.
Using a variation of the automatic thesaurus generation
techniques, to which we refer as the concept space
approach, we aimed to create graphs of domain-specific
concepts (terms) and their weighted co-occurrence
relationships for all major engineering domains.
Merging these concept spaces and providing traversal
paths across different concept spaces could potentially
help alleviate the vocabulary (difference) problem
evident in large-scale information retrieval. We have
experimented previously with such a technique for a
smaller molecular biology domain (Worm Community
System, with 10+ MBs of document collection) with
encouraging results. In order to address the
scalability issue related to large-scale information
retrieval and analysis for the current Illinois DLI
project, we recently conducted experiments using the
concept space approach on parallel supercomputers. Our
test collection included 2+ GBs of computer science and
electrical engineering abstracts extracted from the
INSPEC database. The concept space approach called for
extensive textual and statistical analysis (a form of
knowledge discovery) based on automatic indexing and
cooccurrence analysis algorithms, both previously
tested in the biology domain. Initial testing results
using a 512-node CM-5 and a 16-processor SGI Power
Challenge were promising. Power Challenge was later
selected to create a comprehensive computer engineering
concept space of about 270,000 terms and 4,000,000+
links using 24.5 hours of CPU time. Our system
evaluation involving 12 knowledgeable subjects revealed
that the automatically-created computer engineering
concept space generated significantly higher concept
recall than the human-generated INSPEC computer
engineering thesaurus. However, the INSPEC was more
precise than the automatic concept space. Our current
work mainly involves creating concept spaces for other
major engineering domains and developing robust graph
matching and traversal algorithms for cross-domain,
concept-based retrieval. Future work also will include
generating individualized concept spaces for assisting
user-specific concept-based information retrieval.",
keywords = "INFORMATION-RETRIEVAL, DOCUMENT-RETRIEVAL, CONNECTION
MACHINE, NEURAL NETWORKS, SYSTEMS, SEARCH, PERFORMANCE,
DATABASES, DESIGN, MODEL, SEMANTIC RETRIEVAL, CONCEPT
SPACE, CONCEPT ASSOCIATION, PARALLEL COMPUTING, DIGITAL
LIBRARY",
}
Semantics-Based Information Management and Retrieval: A Knowledge Discovery Approach, H. Chen and K. Lynch
@Article{chen.ea:semantics-based-information:92,
author = "H. Chen and K. Lynch",
title = "Semantics-Based Information Management and Retrieval:
{A} Knowledge Discovery Approach",
journal = "IEEE Transactions on Systems, Man, and Cybernetics",
publisher = "IEEE",
year = "1992",
note = "Forthcoming",
abstract = "We report results of a study that involved the
creation of knowledge bases from large, operational
textual databases. Two East-bloc computing knowledge
bases, both based on semantic network structure, were
created automatically using two statistical algorithms.
With the help of four East-bloc computing experts, we
evaluated the two knowledge bases in detail in a
concept-association experiment based on recall and
recognition tests. In our experiment, one of the
knowledge bases that exhibited the asymmetric link
property out-performed all four experts in recalling
relevant concepts in East-bloc computing. The knowledge
base, which contained about 20,000 concepts (nodes) and
280,000 weighted relationships (links), was
incorporated as a thesaurus-like component into an
intelligent retrieval system. The system allowed users
to perform semantics-based information management and
information retrieval via interactive, conceptual
relevance feedback. Current research efforts include
development of a meta knowledge base and design of
semantic network and neural network based inferencing
algorithms.",
}
Growing Simpler Decision Trees to Facilitate Knowledge Discovery, Kevin J. Cherkauer and Jude W. Shavlik
@InProceedings{cherkauer.ea:growing-simpler:96,
  author   = {Cherkauer, Kevin J. and Shavlik, Jude W.},
  title    = {Growing Simpler Decision Trees to Facilitate Knowledge
              Discovery},
  pages    = {315},
  crossref = {simoudis.ea:proceedings-second:96},
}
Efficient mining of association rules in distributed databases, D. W. Cheung and V. T. Ng and A. W. Fu and Y. J. Fu
@Article{cheung.ea:efficient-association:96,
author = "D. W. Cheung and V. T. Ng and A. W. Fu and Y. J. Fu",
address = "Univ Hong Kong, Dept Comp Sci, Hong Kong, Hong Kong
Hong Kong Polytech Univ, Dept Comp, Hong Kong, Hong
Kong Chinese Univ Hong Kong, Dept Comp Sci \& Engn,
Hong Kong, Hong Kong Simon Fraser Univ, Sch Comp Sci,
Burnaby, Bc V5A 1S6, Canada",
title = "Efficient mining of association rules in distributed
databases",
journal = "IEEE Transactions on Knowledge and Data Engineering",
year = "1996",
month = dec,
volume = "8",
number = "6",
pages = "911--922",
abstract = "Many sequential algorithms have been proposed for
mining of association rules. However, very little work
has been done in mining association rules in
distributed databases. A direct application of
sequential algorithms to distributed databases is not
effective, because it requires a large amount of
communication overhead. In this study, an efficient
algorithm, DMA, is proposed. It generates a small
number of candidate sets and requires only O(n)
messages for support count exchange for each candidate
set, where n is the number of sites in a distributed
database. The algorithm has been implemented on an
experimental test bed and its performance is studied.
The results show that DMA has superior performance when
comparing with the direct application of a popular
sequential algorithm in distributed databases.",
keywords = "data mining, knowledge discovery, distributed data
mining, association rule, distributed database,
distributed algorithm, partitioned database",
}
Maintenance of Discovered Knowledge: A Case in Multi-Level Association Rules, David W. Cheung and Vincent T. Ng and Benjamin W. Tam
@InProceedings{cheung.ea:maintenance-discovered:96,
  author   = {Cheung, David W. and Ng, Vincent T. and Tam, Benjamin W.},
  title    = {Maintenance of Discovered Knowledge: {A} Case in
              Multi-Level Association Rules},
  pages    = {307},
  crossref = {simoudis.ea:proceedings-second:96},
}
Knowledge discovery in databases: a rule-based attribute-oriented approach, D. W.-L. Cheung and A. W.-C. Fu and J. Han
@InProceedings{cheung.ea:rule-based-attribute-oriented:94a,
key_modifier = "a",
author = "D. W.-L. Cheung and A. W.-C. Fu and J. Han",
title = "Knowledge discovery in databases: a rule-based
attribute-oriented approach",
pages = "164--173",
editor = "Zbigniew W. Ra{\'s} and Maria Zemankova",
booktitle = "Proceedings of the 8th International Symposium on
Methodologies for Intelligent Systems",
month = oct,
series = "LNAI",
volume = "869",
publisher = "Springer",
address = "Berlin",
year = "1994",
}
Knowledge discovery in databases: a rule-based attribute-oriented approach, D. W.-L. Cheung and A. W.-C. Fu and J. Han
@Article{cheung.ea:rule-based-attribute-oriented:94b,
key_modifier = "b",
author = "D. W.-L. Cheung and A. W.-C. Fu and J. Han",
title = "Knowledge discovery in databases: a rule-based
attribute-oriented approach",
journal = "Lecture Notes in Computer Science",
volume = "869",
pages = "164--173",
year = "1994",
ISSN = "0302-9743",
}
Using Artificial Intelligence Planning to Automate Science Data Analysis for Large Image Databases, Steve Chien and Forest Fisher and Helen Mortensen and Edisanter Lo and Ronald Greeley
@InProceedings{chien.ea:using-artificial:97,
title = "Using Artificial Intelligence Planning to Automate
Science Data Analysis for Large Image Databases",
author = "Steve Chien and Forest Fisher and Helen Mortensen
and Edisanter Lo and Ronald Greeley",
pages = "147",
crossref = "heckerman.ea:proceedings-third:97",
}
A framework for query optimization to support data mining, R. Sunil Choenni and Arno P. J. M. Siebes
Available as
compressed postscript.
@TechReport{choenni.ea:framework-query:96,
author = "R. Sunil Choenni and Arno P. J. M. Siebes",
title = "A framework for query optimization to support data
mining",
institution = "Centrum voor Wiskunde en Informatica (CWI)",
number = "CS-R9637",
ISSN = "0169-118X",
month = oct # " 31",
year = "1996",
keywords = "data mining systems, search strategies, query
optimization, physical database design",
URL = "ftp://ftp.cwi.nl/pub/CWIreports/AA/CS-R9637.ps.Z",
abstract = "In order to extract knowledge from databases, data
mining algorithms heavily query the databases.
Inefficient processing of these queries will inevitably
have its impact on the performance of these algorithms,
making them less valuable. In this paper, we describe
an optimization framework for an efficient processing
of queries generated by different data mining
algorithms. In this framework, we show how to take
advantage of the physical organization of the database,
the operators and the control structures used in an
algorithm. Finally, we discuss how our framework fits
into conventional query optimization frameworks.",
note = "AA (Department of Algorithmics and Architecture)",
annote = "originally contained the following fields and values -
booktitle, 105 note, CS-R9637",
}
On multi-query optimization, R. (Sunil) Choenni and Martin L. Kersten and Johan F. P. van den Akker and Amani Saad
Available as
compressed postscript.
@TechReport{choenni.ea:on-multi-query:96,
author = "R. Sunil Choenni and Martin L. Kersten and Johan F.
P. van den Akker and Amani Saad",
title = "On multi-query optimization",
pages = "19",
institution = "Centrum voor Wiskunde en Informatica (CWI)",
number = "CS-R9638",
ISSN = "0169-118X",
month = oct # " 31",
year = "1996",
keywords = "multi-query optimization, architectures, exploiting
interdependencies between queries",
URL = "ftp://ftp.cwi.nl/pub/CWIreports/AA/CS-R9638.ps.Z",
abstract = "In some key database applications, such as data
mining, a sequence of interdependent queries may be
posed simultaneously to the DBMS. The optimization of
such sequences is called multi-query optimization, and
it attempts to exploit these dependencies in the
derivation of a query evaluation plan (qep). Although
it has been observed and demonstrated by several
researchers that exploitation of dependencies speed up
the query processing, limited research has been
reported how to benefit from multi-query optimization,
taking the capabilities of existing query optimizers
into account. This is exactly the topic of this paper.
Since existing optimizers are able to optimize queries
in which a restricted number of basic operations
appears, e.g., number of joins is limited to 10, and
the optimization of a query is relatively expensive, we
attempt to profit from multi query optimization under
the condition that queries are passed only once and
separately to the optimizer. We propose a two-step
optimization procedure. In the first step, we
determine, on the basis of the dependencies between
queries, in which order they should be specified and
what results should be stored. In the second step, each
query is passed separately to an optimizer.",
note = "AA (Department of Algorithmics and Architecture)",
annote = "originally contained the following fields and values -
note, CS-R9638, booktitle, 143",
}
Using a Hybrid Neural/Expert System for Data Base Mining in Market Survey Data, Victor Ciesielski and Gregory Palstra
@InProceedings{ciesielski.ea:using-hybrid:96,
  author   = {Ciesielski, Victor and Palstra, Gregory},
  title    = {Using a Hybrid Neural/Expert System for Data Base
              Mining in Market Survey Data},
  pages    = {38},
  crossref = {simoudis.ea:proceedings-second:96},
}
Classification Problem Solving, W. J. Clancey
@InProceedings{clancey:classification-problem:84,
  author    = {Clancey, W. J.},
  title     = {Classification Problem Solving},
  editor    = {Brachman, R. J.},
  booktitle = {Proceedings of the National Conference on Artificial
               Intelligence},
  publisher = {William Kaufmann},
  address   = {Austin, Texas},
  month     = aug,
  year      = {1984},
  pages     = {49--55},
}
The CN2 Induction Algorithm, Peter Clark and Tim Niblett
@Article{clark.ea:cn2-induction:89,
  author  = {Clark, Peter and Niblett, Tim},
  title   = {The {CN2} Induction Algorithm},
  journal = {Machine Learning},
  volume  = {3},
  pages   = {261--283},
  year    = {1989},
}
Knowledge Representation in Machine Learning, Peter Clark
@InCollection{clark:representation-machine:89,
  author    = {Clark, Peter},
  title     = {Knowledge Representation in Machine Learning},
  booktitle = {Machine and Human Learning, advances in European
               Research},
  editor    = {Kodratoff, Yves and Hutchinson, Alan},
  publisher = {Michael Horwood},
  address   = {London},
  year      = {1989},
  pages     = {35--49},
}
Security and Privacy Implications of Data Mining, Chris Clifton and Don Marks
Available as
postscript.
@InProceedings{clifton.ea:security-privacy:96,
author = "Chris Clifton and Don Marks",
title = "Security and Privacy Implications of Data Mining",
booktitle = "Workshop on Data Mining and Knowledge Discovery",
address = "Montreal, Canada",
organization = "ACM SIGMOD",
year = "1996",
publisher = "University of British Columbia Department of Computer
Science",
series = "Technical Report",
number = "96-08",
pages = "15--19",
month = jun # " 2",
URL = "ftp://ftp.fas.sfu.ca/pub/cs/han/dmkd96/p15.ps",
contributedby = "clifton(at)mitre.org",
}
Overfitting Explained, P. R. Cohen and D. Jensen
Available as
postscript.
@InProceedings{cohen.ea:overfitting-explained:97,
author = "P. R. Cohen and D. Jensen",
title = "Overfitting Explained",
booktitle = "Preliminary Papers of the Sixth International Workshop
on Artificial Intelligence and Statistics",
year = "1997",
month = jan,
pages = "115--122",
abstract = "Overfitting arises when model components are evaluated
against the wrong reference distribution. Most modeling
algorithms iteratively find the best of several
components and then test whether this component is good
enough to add to the model. We show that for
independently distributed random variables, the
reference distribution for any one variable
underestimates the reference distribution for the
highest-valued variable; thus variate values will
appear significant when they are not, and model
components will be added when they should not be added.
We relate this problem to the well-known statistical
theory of multiple comparisons or simultaneous
inference.",
abstract_url = "http://eksl-www.cs.umass.edu/~jensen/papers/ais97b.html",
URL = "http://www-eksl.cs.umass.edu/papers/cohen-ais96b.ps",
}
The Role of Knowledge Mining in the Development and Evolution of New Applications, David Cohen and L. Berke and P. Bloom and D. Cohen and D. Tsur
@InProceedings{cohen.ea:role-development:94,
  author    = {Cohen, David and Berke, L. and Bloom, P. and Cohen, D. and
               Tsur, D.},
  title     = {The Role of Knowledge Mining in the Development and
               Evolution of New Applications},
  editor    = {Elmagarmid, Ahmed K. and Neuhold, Erich},
  booktitle = {Proceedings of the 10th International Conference on
               Data Engineering},
  publisher = {IEEE Computer Society Press},
  address   = {Houston, TX},
  month     = feb,
  year      = {1994},
  pages     = {166--167},
}
Knowledge in context: a strategy for expert system maintenance, P. Compton and R. Jansen
@InProceedings{compton.ea:context-strategy:88,
  author    = {Compton, P. and Jansen, R.},
  title     = {Knowledge in context: a strategy for expert system
               maintenance},
  booktitle = {Proceedings of the 2nd {A}ustralian Joint Artificial
               Intelligence conference},
  series    = {Lecture Notes in Artificial Intelligence},
  volume    = {406},
  publisher = {Springer},
  address   = {Adelaide},
  year      = {1988},
  pages     = {292--306},
}
Knowledge discovery in molecular databases, D. Conklin and S. Fortier and J. Glasgow
@Article{conklin.ea:molecular:93,
author = "D. Conklin and S. Fortier and J. Glasgow",
address = "Queens Univ, Dept Comp \& Informat Sci, Kingston K7L
3N6, On, Canada Queens Univ, Dept Chem, Kingston K7L
3N6, On, Canada",
title = "Knowledge discovery in molecular databases",
journal = "IEEE Transactions on Knowledge and Data Engineering",
year = "1993",
volume = "5",
number = "6",
pages = "985--987",
abstract = "This paper describes an approach to knowledge
discovery in complex molecular databases. The machine
learning paradigm used is structured concept formation,
in which objects described in terms of components and
their interrelationships are clustered and organized in
a knowledge base. Symbolic images are used to represent
classes of structured objects. A discovered molecular
knowledge base is successfully used in the
interpretation of a high resolution electron density
map.",
keywords = "PROTEIN, CASE-BASED REASONING, CHEMICAL INFORMATION
RETRIEVAL, CONCEPTUAL CLUSTERING, DESCRIPTION LOGICS,
INDEXING, RELATIONAL MODELS, SCENE ANALYSIS, SPATIAL
CONCEPTS, SPATIAL REASONING, STRUCTURED CONCEPT
FORMATION",
}
Machine discovery of protein motifs, D. Conklin
@Article{conklin:machine-protein:95,
author = "D. Conklin",
address = "Zymogenet Inc, 1201 Eastlake Ave E, Seattle, Wa,
98102",
title = "Machine discovery of protein motifs",
journal = "Machine Learning",
year = "1995",
volume = "21",
number = "1--2",
pages = "125--150",
abstract = "The investigation of relations between protein
tertiary structure and amino acid sequence is a topic
of tremendous importance in molecular biology. The
automated discovery of recurrent patterns of structure
and sequence is an essential part of this
investigation. These patterns, known as protein motifs,
are abstractions of fragments drawn from proteins of
known sequence and tertiary structure. This paper has
two objectives. The first is to introduce and define
protein motifs, and provide a survey of previous
research on protein motif discovery. The second is to
present and apply a novel approach to protein motif
representation and discovery, which is based on a
spatial description logic and the symbolic machine
learning paradigm of structured concept formation. A
large database of protein fragments is processed using
this approach, and several interesting and significant
protein motifs are discovered.",
keywords = "SECONDARY STRUCTURE, SEQUENCE PATTERNS, PREDICTIVE
POWER, IDENTIFICATION, RECOGNITION, GENERATION,
DEFINITION, TEMPLATES, SETS, PROTEIN TERTIARY
STRUCTURE, MACHINE DISCOVERY, RELATIONAL LEARNING,
KNOWLEDGE REPRESENTATION, DESCRIPTION LOGICS,
INFORMATION RETRIEVAL, KNOWLEDGE DISCOVERY IN
DATABASES",
}
Scalable discovery of informative structural concepts using domain knowledge, D. J. Cook and L. B. Holder and S. Djoko
@Article{cook.ea:scalable-informative:96,
author = "D. J. Cook and L. B. Holder and S. Djoko",
address = "Univ Texas, Dept Comp Sci \& Engn, Arlington, Tx,
76019 Bell No Res, Sci Staff, Richardson, Tx",
title = "Scalable discovery of informative structural concepts
using domain knowledge",
journal = "IEEE Expert: Intelligent Systems \& Their
Applications",
year = "1996",
volume = "11",
number = "5",
pages = "59--68",
}
Substructure Discovery Using Minimum Description Length and Background Knowledge, D. J. Cook and L. B. Holder
Available as
postscript.
@Article{cook.ea:substructure-using:94,
author = "D. J. Cook and L. B. Holder",
title = "Substructure Discovery Using Minimum Description
Length and Background Knowledge",
journal = "Journal of Artificial Intelligence Research",
year = "1994",
volume = "1",
pages = "231--255",
abstract = "The ability to identify interesting and repetitive
substructures is an essential component to discovering
knowledge in structural data. We describe a new version
of our SUBDUE substructure discovery system based on
the minimum description length principle. The SUBDUE
system discovers substructures that compress the
original data and represent structural concepts in the
data. By replacing previously-discovered substructures
in the data, multiple passes of SUBDUE produce a
hierarchical description of the structural regularities
in the data. SUBDUE uses a computationally-bounded
inexact graph match that identifies similar, but not
identical, instances of a substructure and finds an
approximate measure of closeness of two substructures
when under computational constraints. In addition to
the minimum description length principle, other
background knowledge can be used by SUBDUE to guide the
search towards more appropriate substructures.
Experiments in a variety of domains demonstrate
SUBDUE's ability to find substructures capable of
compressing the original data and to discover
structural concepts important to the domain.",
annote = "The SUBDUE system discovers substructures that
compress the original data and represent structural
concepts in the data. By replacing
previously-discovered substructures in the data,
multiple passes of SUBDUE produce a hierarchical
description of the structural regularities in the
data.",
URL = "gopher://P.GP.CS.CMU.EDU:70/00/volume1/cook94a.ps",
}
What has Mill to Say About Data Mining ?, Tremaine A. O. Cornish and Anthony D. Elliman
@InProceedings{cornish.ea:what-has:95,
author = "Tremaine A. O. Cornish and Anthony D. Elliman",
title = "What has {Mill} to Say About Data Mining?",
pages = "347--353",
booktitle = "Proceedings of the Eleventh Conference on Artificial
Intelligence for Applications",
month = "20--23~" # feb,
publisher = "IEEE Computer Society Press",
address = "Los Alamitos",
year = "1995",
}
Historical perspectives on information-science, T. A. O. Cornish
@Article{cornish:historical-perspectives:96,
author = "T. A. O. Cornish",
address = "Brunel Univ, Dept Comp Sci \& Informat Syst, Uxbridge
Ub8 3Ph, Middx, England",
title = "Historical perspectives on information-science",
journal = "Systems Research and Information Science",
year = "1996",
volume = "7",
number = "2",
pages = "105--116",
abstract = "There is a general attitude in science and
particularly computer science, that if something is
more than five years old, then we have nothing to learn
from it. This paper seeks first to destroy the basis of
this myth with reference to areas of current research
which are still striving to live up to visions set many
years ago. Secondly to look at an area of research,
Knowledge Discovery in Databases and demonstrate that
it too has a great deal to learn from the distant past,
which has been all but overlooked.",
keywords = "KNOWLEDGE DISCOVERY, SYSTEMATIC, SCIENTIFIC, DATA
MINING, HISTORICAL, INFORMATION, SYSTEMS",
}
Data Mining of Multi-dimensional Remotely Sensed Images, Robert F. Cromp and William J. Campbell
@InProceedings{cromp.ea:multi-dimensional-remotely:93,
  author    = {Cromp, Robert F. and Campbell, William J.},
  title     = {Data Mining of Multi-dimensional Remotely Sensed
               Images},
  editor    = {Bhargava, Bharat and Finin, Timothy and Yesha, Yelena},
  booktitle = {Proceedings of the 2nd International Conference on
               Information and Knowledge Management},
  publisher = {ACM Press},
  address   = {New York, NY, USA},
  month     = nov,
  year      = {1993},
  pages     = {471--480},
}
Knowledge Discovery in Databases: Exploiting Knowledge-Level Redescription, J. Cupit and N. Shadbolt
@Article{cupit.ea:exploiting-knowledge-level:96a,
key_modifier = "a",
author = "J. Cupit and N. Shadbolt",
title = "Knowledge Discovery in Databases: Exploiting
Knowledge-Level Redescription",
journal = "Lecture Notes in Computer Science",
volume = "1076",
pages = "245--261",
year = "1996",
ISSN = "0302-9743",
}
Knowledge Discovery in Databases: Exploiting Knowledge-Level Redescription, James Cupit and Nigel Shadbolt
@InProceedings{cupit.ea:exploiting-knowledge-level:96b,
key_modifier = "b",
author = "James Cupit and Nigel Shadbolt",
title = "Knowledge Discovery in Databases: Exploiting
Knowledge-Level Redescription",
pages = "245--261",
editor = "Nigel Shadbolt and Kieron O'Hara and Guus Schreiber",
booktitle = "Proceedings of the Ninth European Knowledge
Acquisition Workshop ({EKAW}-96)",
month = "14--17~" # may,
series = "LNAI",
volume = "1076",
publisher = "Springer",
address = "Berlin",
year = "1996",
}
Mining Knowledge in Noisy Audio Data, Andrzej Czyzewski
@InProceedings{czyzewski:noisy-audio:96,
  author   = {Czyzewski, Andrzej},
  title    = {Mining Knowledge in Noisy Audio Data},
  pages    = {220},
  crossref = {simoudis.ea:proceedings-second:96},
}
Distributed learning: An agent-based approach to data-mining, Winton Davies and Peter Edwards
@InProceedings{davies.ea:distributed-learning:95,
title = "Distributed learning: {A}n agent-based approach to
data-mining",
author = "Winton Davies and Peter Edwards",
booktitle = "Working