% Data Mining Bibliographies Copyright Information % % The author reserves the % % Copyright (C) 1997 Andy Pryke. All rights reserved. % % for the compilation of this KDD bibliography collection. % % If you find the bibliography collection useful for your work, I would % be happy if you acknowledge it and me. You could also send me a % postcard if you wish (address below). % % I usually give my consent that the collection may be copied and % distributed with the following conditions: % % 1) It may be used only for research or educational purposes % % and % % 2) Any copy must be accompanied by a reference to the original % collection and its author. % % and % % 3) This information must always accompany every copy of a bibliograhy. % % I reserve the right to revoke the above permission at any time. % % Any other use must be negotiated in advance. % % Any commercial use of the bibliographies is strictly prohibited. In % particular, the whole or derived bibliographies may not be sold for % profit or included in commercial documents (e.g., published on CD-ROM, % floppy disks, books, magazines, or other print form) without the prior % written permission of the copyright holder. % % Please contact the author if the intended usage is not covered by the % above statement. % % Abstracts of publications published by the ACM and the IEEE are also % subject to the respective "interim" or "provisional" copyright % policies: % % ACM copyright policy (http://www.acm.org/pubs/copyright_policy/) % IEEE copyright policy (http://www.ieee.org/copyright/policies.htm) % % This copyright notice is derived from one by Alf-Christian Achilles % for his (massive) Computer Science Bibliography Collection at % (http://liinwww.ira.uka.de/bibliography/index.html). % % -------------------------------------------------------------------- % % My address: % % My postal address is: % % Andy Pryke, % Department of Computer Science, % The University of Birmingham, % Edgbaston, % Birmingham. 
% B15 2TT
%
% Fax : 0121 414 4281
% Phone: 0121 414 3736
% Email: A.N.Pryke(at)cs.bham.ac.uk
% Web: http://www.cs.bham.ac.uk/~anp/
%

% NOTE(review): placeholder for a whole journal issue. It has no author and no
% title (both required for Article entries); `key` is the only sort/label fallback.
@Article{machine_learning_journal_special:93,
  key = "Machine_Learning_Journal_Special:93",
  journal = "Machine Learning Journal",
  year = "1993",
  volume = "5",
  number = "6",
  month = dec,
  note = "Special issue on Learning and Discovery in Databases",
}

% NOTE(review): no author, institution or year is recorded for this report;
% all three are required for TechReport entries.
% The percent sign in the abstract is escaped so the field can be typeset safely.
@TechReport{no_author:improved-methods:,
  url = "ftp://ftp.cs.helsinki.fi/pub/Reports/by_Project/PMDM/Improved_Methods_for_Finding_Association_Rules.ps.gz",
  title = "Improved Methods for Finding Association Rules",
  abstract = "Association rules are statements of the form for 90\% of the rows of the relation, if the row has value 1 in the columns in set W, then it has 1 also in column B. Agrawal, Imielinski, and Swami introduced the problem of mining association rules from large collections of data, and gave a method based on successive passes over the database. We give an improved algorithm for the problem. The method is based on careful combinatorial analysis of the information obtained in previous passes; this makes it possible to eliminate unnecessary candidate rules. Experiments on a university course enrollment database indicate that the method outperforms the previous one by a factor of 5. We also give simple information-theoretic lower bounds for the problem of finding association rules, and show that sampling is in general a very efficient way of finding such rules. Computing Reviews Categories and Subject Descriptors: H.3.3 [Information Systems]: Information Storage and Retrieval - Information Search and Retrieval I.2.6 [Computing Methodologies]: Artificial Intelligence - Learning I.2.8 [Computing Methodologies]: Artificial Intelligence - Problem Solving, Control Methods, and Search General Terms: Databases, machine learning, artificial intelligence.
Additional Key Words and Phrases: Database mining, knowledge discovery in databases, association rules, covering sets.",
}

% NOTE(review): no author, institution or year is recorded for this report.
@TechReport{no_author:learning-decision:,
  title = "Learning Decision Trees for Mapping the Local Environment in Mobile Robot Navigation",
  url = "ftp://coral.cs.jcu.edu.au/../placeholder",
  abstract = "placeholder",
}
Comments to Xindong Wu (xindong(at)INSECT.SD.MONASH.EDU.AU),",
}

% Collection of magazine articles; no author is recorded.
@Article{no_author:state-art:95,
  title = "State Of The Art",
  journal = "Byte",
  year = "1995",
  month = oct,
  url = "http://www.byte.com/art/9510/sec8/sec8.htm",
  annote = "A number of articles, good introduction to data mining",
}

% Booktitle, year and publisher are inherited from the crossref parent.
@InProceedings{adomavicius.ea:actionable-patterns:97,
  author = "Gediminas Adomavicius and Alexander Tuzhilin",
  title = "Discovery of Actionable Patterns in Databases: The Action Hierarchy Approach",
  pages = "111",
  crossref = "heckerman.ea:proceedings-third:97",
}

% Conference version; the SIGMOD Record version is agrawal.ea:association-rules:93b.
@InProceedings{agrawal.ea:association-rules:93a,
  author = "Rakesh Agrawal and Tomasz Imielinski and Arun N. Swami",
  title = "Mining Association Rules between Sets of Items in Large Databases",
  editor = "Peter Buneman and Sushil Jajodia",
  booktitle = "Proceedings of the 1993 {ACM} {SIGMOD} International Conference on Management of Data",
  address = "Washington, D.C.",
  month = "26--28~" # may,
  year = "1993",
  pages = "207--216",
  url = "http://www.almaden.ibm.com/cs/people/ragrawal/papers/sigmod93.ps",
  key_modifier = "a",
  abstract = "We are given a large database of customer transactions. Each transaction consists of items purchased by a customer in a visit. We present an efficient algorithm that generates all significant association rules between items in the database. The algorithm incorporates buffer management and novel estimation and pruning techniques.
We also present results of applying this algorithm to sales data obtained from a large retailing company, which shows the effectiveness of the algorithm.",
}

% SIGMOD Record (journal) version of the conference paper agrawal.ea:association-rules:93a.
@Article{agrawal.ea:association-rules:93b,
  key_modifier = "b",
  author = "Rakesh Agrawal and Tomasz Imielinski and Arun Swami",
  title = "Mining association rules between sets of items in large databases",
  journal = "SIGMOD Record (ACM Special Interest Group on Management of Data)",
  volume = "22",
  number = "2",
  pages = "207--216",
  month = jun,
  year = "1993",
  isbn = "0-89791-592-5",
  issn = "0163-5808",
  abstract = "We are given a large database of customer transactions. Each transaction consists of items purchased by a customer in a visit. We present an efficient algorithm that generates all significant association rules between items in the database. The algorithm incorporates buffer management and novel estimation and pruning techniques. We also present results of applying this algorithm to sales data obtained from a large retailing company, which shows the effectiveness of the algorithm.",
  affiliation = "IBM Almaden Research Cent",
  affiliationaddress = "San Jose, CA, USA",
  classification = "723.3; 921.6; 911.4; 723.2; 722.1; 922.1; C6160Z (Other DBMS); C6130 (Data handling techniques); C6170 (Expert systems); C6120 (File organisation); C7170 (Marketing)",
  conference = "Proceedings of the 1993 ACM SIGMOD International Conference on Management of Data",
  conferenceyear = "1993",
  keywords = "Database systems; Algorithms; Marketing; Data handling; Data storage equipment; Probability; Estimation; Query languages; Large scale systems; Associative processing; Administrative data processing; Large databases; Mining association rules; Pruning technique; Basket data, Large database; Customer transactions; Efficient algorithm; Association rules; Buffer management; Novel estimation; Pruning techniques; Sales data; Large retailing company",
  meetingaddress = "Washington, DC, USA",
  meetingdate = "May 26--28 1993",
  meetingdate2 = "05/26--28/93",
  publisherinfo = "Fort Collins Computer Center",
  sponsor = "ACM, SIGMOD; Minerals, Metals \& Materials Society",
  thesaurus = "Knowledge based systems; Marketing data processing; Storage management; Transaction processing; Very large databases",
  xxcrossref = "Anonymous:1993:SAS",
}

% The issue number is stored in `number` (the field BibTeX styles read), and the
% author affiliation in `affiliation`, matching the other entries in this file.
@Article{agrawal.ea:database-performance:93,
  author = "R. Agrawal and T. Imielinski and A. Swami",
  affiliation = "Ibm Corp, Almaden Res Ctr, 650 Harry Rd, San Jose, Ca, 95120",
  title = "Database mining - a performance perspective",
  journal = "IEEE Transactions on Knowledge and Data Engineering",
  year = "1993",
  volume = "5",
  number = "6",
  pages = "914--925",
  abstract = "We present our perspective of database mining as the confluence of machine learning techniques and the performance emphasis of database technology. We describe three classes of database mining problems involving classification, associations, and sequences, and argue that these problems can be uniformly viewed as requiring discovery of rules embedded in massive data. We describe a model and some basic operations for the process of rule discovery. We show how the database mining problems we consider map to this model and how they can be solved by using the basic operations we propose. We give an example of an algorithm for classification obtained by combining the basic rule discovery operations. This algorithm not only is efficient in discovering classification rules but also has accuracy comparable to ID3, one of the current best classifiers.",
  annote = "Identification and unification of 3 classes of data mining problem, Classification, Association and Sequences. They then go on to propose a unifying framework for these three problems, and five basic operators for rule discovery.
These are then used to construct an algorithm CDP (Classifier with Dynamic Pruning) which out performs ID3 in classifier accuracy and efficiency on a test problem.",
  keywords = "ASSOCIATIONS, CLASSIFICATION, DATABASE MINING, DECISION TREES, KNOWLEDGE DISCOVERY, SEQUENCES",
}

% Booktitle, year and publisher are inherited from the crossref parent.
@InProceedings{agrawal.ea:developing-tightly-coupled:96,
  title = "Developing Tightly-Coupled Data Mining Applications on a Relational Database System",
  pages = "287",
  author = "Rakesh Agrawal and Kyuseok Shim",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{agrawal.ea:fast-algorithms:94,
  author = "R. Agrawal and R. Srikant",
  title = "Fast Algorithms for Mining Association Rules in Large Databases",
  editor = "Jorge B. Bocca and Matthias Jarke and Carlo Zaniolo",
  booktitle = "20th International Conference on Very Large Data Bases, September 12--15, 1994, Santiago, Chile proceedings",
  publisher = "Morgan Kaufmann Publishers",
  address = "Los Altos, CA 94022, USA",
  pages = "487--499",
  year = "1994",
  annote = "Also known as VLDB'94",
  keywords = "very large data bases; VLDB",
}

% The issue number is stored in `number`, the author affiliation in `affiliation`.
@Article{agrawal.ea:parallel-association:96,
  author = "R. Agrawal and J. C. Shafer",
  affiliation = "Ibm Corp, Almaden Res Ctr, 650 Harry Rd, San Jose, Ca, 95120",
  title = "Parallel mining of association rules",
  journal = "IEEE Transactions on Knowledge and Data Engineering",
  year = "1996",
  volume = "8",
  number = "6",
  pages = "962--969",
  abstract = "We consider the problem of mining association rules on a shared-nothing multiprocessor. We present three algorithms that explore a spectrum of trade-offs between computation, communication, memory usage, synchronization, and the use of problem-specific information. The best algorithm exhibits near perfect scaleup behavior, yet requires only minimal overhead compared to the current best serial algorithm.",
  keywords = "data mining, association rules, parallel algorithms",
}

% SIGMOD Record version of agrawal.ea:quest-project:94b (same one-page item).
@Article{agrawal.ea:quest-project:94a,
  key_modifier = "a",
  author = "R. Agrawal and M. Carey and C. Faloutsos and S. Ghosh and A. Houtsma and T. Imielinski and B. Iyer and A. Mahboob and H. Miranda and R. Srikant and A. Swami",
  title = "{Quest}: {A} Project on Database Mining",
  journal = "SIGMOD Record (ACM Special Interest Group on Management of Data)",
  volume = "23",
  number = "2",
  pages = "514",
  month = jun,
  year = "1994",
  issn = "0163-5808",
  affiliation = "IBM Almaden Res. Center, San Jose, CA, USA",
  classification = "C6160 (Database management systems (DBMS))",
  keywords = "Quest project; Database mining; Tertiary storage; Data model construction; Data model verification",
  thesaurus = "Very large databases",
  xxcrossref = "Anonymous:1994:ASI",
}

@InProceedings{agrawal.ea:quest-project:94b,
  key_modifier = "b",
  title = "Quest: {A} Project on Database Mining",
  author = "Rakesh Agrawal and Michael J. Carey and Christos Faloutsos and Sakti P. Ghosh and Maurice A. W. Houtsma and Tomasz Imielinski and Balakrishna R. Iyer and A. Mahboob and H. Miranda and Ramakrishnan Srikant and Arun N. Swami",
  editor = "Richard T. Snodgrass and Marianne Winslett",
  booktitle = "Proceedings of the 1994 {ACM} {SIGMOD} International Conference on Management of Data",
  address = "Minneapolis, Minnesota",
  month = "24--27~" # may,
  year = "1994",
  pages = "514",
}

@InProceedings{agrawal.ea:quest-system:96,
  title = "The Quest Data Mining System",
  pages = "244",
  author = "Rakesh Agrawal and Manish Mehta and John Shafer and Ramakrishnan Srikant and Andreas Arning and Toni Bollinger",
  crossref = "simoudis.ea:proceedings-second:96",
}

% NOTE(review): booktitle normalised to the Data Engineering conference name used
% elsewhere in this file (anwar.ea:by-imprecise:92); confirm against the proceedings.
@InProceedings{agrawal.ea:sequential-patterns:95,
  author = "R. Agrawal and R. Srikant",
  title = "Mining Sequential Patterns",
  booktitle = "International Conference on Data Engineering",
  organization = "IEEE",
  year = "1995",
  pages = "3--14",
  abstract = "We are given a large database of customer transactions, where each transaction consists of customer-id, transaction time, and the items bought in the transaction.
We introduce the problem of mining sequential patterns over such databases. We present three algorithms to solve this problem, and empirically evaluate their performance using synthetic data. Two of the proposed algorithms, AprioriSome and AprioriAll, have comparable performance, albeit AprioriSome performs a little better when the minimum number of customers that must support a sequential pattern is low. Scale-up experiments show that both AprioriSome and AprioriAll scale linearly with the number of customer transactions. They also have excellent scale-up properties with respect to the number of transactions per customer and the number of items in a transaction.",
}

% NOTE(review): the next two entries share author, pages (75--76), year and the
% 13th PODS symposium venue; presumably the same item recorded twice. Confirm
% before merging.
@InProceedings{agrawal:data-mining:94,
  author = "Rakesh Agrawal",
  title = "Data Mining",
  pages = "75--76",
  booktitle = "Proceedings of the 13th Symposium on Principles of Database Systems",
  month = may,
  publisher = "ACM Press",
  address = "New York, NY, USA",
  year = "1994",
}

@InProceedings{agrawal:tutorial:94,
  author = "R. Agrawal",
  title = "Tutorial: Data Mining",
  editor = "{ACM}",
  booktitle = "13th Symposium --- 1994 May: Minneapolis; {MN}",
  volume = "13",
  publisher = "ACM Press",
  address = "New York, NY 10036, USA",
  series = "PROCEEDINGS OF THE ACM SIGACT SIGMOD SIGART SYMPOSIUM ON PRINCIPLES OF DATABASE SYSTEMS 1994",
  pages = "75--76",
  year = "1994",
  keywords = "database systems; ACM; SIGACT; SIGMOD; SIGART; computability; theory",
}

% Online slides; no year is recorded.
@Misc{aha:machine-learning:,
  url = "http://www.aic.nrl.navy.mil/~aha/slides.html",
  title = "Machine Learning tutorial (Slides and Annotated Bibliography)",
  author = "David Aha",
  annote = "David Aha presented the Machine Learning tutorial at AI \& Stats 1995.
He's kindly put his slides online",
}

% NOTE(review): the source gave the year as "1992/3". `year` now holds the
% four-digit value matching the citation key, and the original dating is kept in
% `note`. Confirm which calendar year applies.
@TechReport{al-naemi:temporal-aspects:92,
  author = "Salem Al-naemi",
  title = "Temporal aspects in data mining",
  institution = "Computer Science Department, University of Birmingham",
  year = "1992",
  note = "Dated 1992/3",
  annote = "Sections on RdB's, other temporal models and time series",
}

% The end page is unknown in the source ("65--??").
@Article{alexander:mine-gold:94,
  author = "Michael Alexander",
  title = "Mine for Gold with Parallel Systems",
  journal = "Datamation",
  volume = "40",
  number = "22",
  pages = "65--??",
  day = "15",
  month = nov,
  year = "1994",
  issn = "0011-6963",
  abstract = "Parallel computing technology has become more accessible to IS shops with the release of parallelized versions of popular RDBMSs. With such off-the-shelf tools, your company can gain competitive advantage through techniques like data mining that allow you to more finely analyze and project demand for your products. But if you're going to need the power of massively parallel systems, off-the-shelf solutions are still a few years away.",
}

% Booktitle, year and publisher are inherited from the crossref parent.
@InProceedings{ali.ea:partial-classification:97,
  title = "Partial Classification Using Association Rules",
  author = "Kamal Ali and Stefanos Manganaris and Ramakrishnan Srikant",
  pages = "115",
  crossref = "heckerman.ea:proceedings-third:97",
}

@Article{allen:charter:95,
  author = "Robert B. Allen",
  title = "Charter",
  journal = "ACM Transactions on Information Systems",
  volume = "13",
  number = "3",
  pages = "235",
  year = "1995",
  copyright = "(c) Copyright 1995 Association for Computing Machinery",
  abstract = "The ACM Transactions on Information Systems (TOIS) considers the design, performance, and evaluation of computer systems that facilitate the presentation of information in a variety of media, as well as underlying technologies that support these systems.
The major themes of TOIS and those topics which distinguish it from other ACM Transactions include: - Information Retrieval and Information Filtering: Algorithms and inference mechanisms for search, retrieval, and presentation of information and models of user information preferences. - Information Interfaces: Hypertext and hypermedia interfaces, information visualization, multimedia presentation, and task and user models for information systems. - Natural Language Processing: Computational linguistics and models of natural language (including content, syntax, semantics, and dialogue) relevant to information systems. - Knowledge and Information Representation: Representation issues for supporting information systems including semantic and object-oriented databases, knowledge bases, and hypertext/hypermedia document models. - Multimedia Information Systems: Semantics, search, and presentation of media including audio, image, video, and virtual reality. - Networked Information Systems: Interfaces and indexing, resource discovery, and visualization. - Organizational Interfaces and Social Impact of Information Systems: Electronic mail; decision and negotiation support systems; the effects of information system use on groups, organizations, and communities; social constraints imposed on information systems such as legal and privacy concerns. - Design and Evaluation of Information Systems: Design principles for information systems, methodologies for evaluating information systems, and programming languages relevant to information systems. - Information System Applications: Electronic books, documents, journals, movies, and libraries; authoring systems; office information systems; geographic information systems; and intelligent tutoring systems.",
}

% In the three Article entries below the issue number is stored in `number`
% (the field BibTeX styles read) and the author affiliation in `affiliation`.
@Article{alnahi.ea:biomedical-machine:93,
  author = "H. Alnahi and S. Alshawi",
  affiliation = "Brunel Univ, Dept Comp Sci, Uxbridge Ub8 3Ph, Middx, England",
  title = "Knowledge discovery in biomedical databases - a machine induction approach",
  journal = "Computer Methods and Programs in Biomedicine",
  year = "1993",
  volume = "39",
  number = "3--4",
  pages = "343--349",
  abstract = "The increase in the number and size of available databases by far exceeds the growth of the corresponding knowledge. Furthermore, many databases contain information which is not possessed by an existing human expert. This creates both a need and an opportunity for extracting knowledge from databases. An unsolved problem in molecular biology is the problem of predicting a protein's secondary structure from its primary structure. Inductive machine learning is a search for a plausible general description which can explain the given input data, and is useful for predicting new data. In this paper we present a statistical inductive algorithm which can be used to produce new rules for predicting multiple protein secondary structures from protein primary structure databases.",
  keywords = "SECONDARY STRUCTURE, PREDICTION, SEQUENCE, MACHINE LEARNING, INDUCTION, DATABASES, KNOWLEDGE, RULES, PROTEIN PRIMARY SECONDARY STRUCTURES, AMINO ACID RESIDUES",
}

@Article{an.ea:discovering-rules:96,
  author = "A. J. An and N. Shan and C. Chan and N. Cercone and W. Ziarko",
  affiliation = "Univ Regina, Dept Comp Sci, Regina, Sk S4S 0A2, Canada",
  title = "Discovering rules for water demand prediction: an enhanced rough-set approach (reprinted from proceedings of the international joint conference on artificial intelligence)",
  journal = "Engineering Applications of Artificial Intelligence",
  year = "1996",
  volume = "9",
  number = "6",
  pages = "645--653",
  abstract = "Prediction of consumer demands is a pre-requisite for optimal control of water distribution systems because minimum-cost pumping schedules can be computed if water demands are accurately estimated. This paper presents an enhanced rough-sets method for generating prediction rules from a set of observed data. The proposed method extends upon the standard rough set model by making use of the statistical information inherent in the data to handle incomplete and ambiguous training samples. It also discusses some experimental results from using this method for discovering knowledge on water demand prediction. Copyright (C) 1996 IJCAI Inc.",
  keywords = "water demand prediction, knowledge discovery, rough sets",
}

% The acronym EDM is brace-protected so sentence-casing styles keep it upper case.
@Article{anand.ea:edm-general:96,
  author = "S. S. Anand and D. A. Bell and J. G. Hughes",
  affiliation = "Univ Ulster, Fac Informat, Sch Informat \& Software Engn, Jordanstown, North Ireland",
  title = "{EDM} - a general framework for data mining based on evidence theory",
  journal = "Data \& Knowledge Engineering",
  year = "1996",
  volume = "18",
  number = "3",
  pages = "189--223",
  abstract = "Data Mining or Knowledge Discovery in Databases [1,15,23] is currently one of the most exciting and challenging areas where database techniques are coupled with techniques from Artificial Intelligence and mathematical sub-disciplines to great potential advantage. It has been defined as the non-trivial extraction of implicit, previously unknown and potentially useful information from data. A lot of research effort is being directed towards building tools for discovering interesting patterns which are hidden below the surface in databases. However, most of the work being done in this field has been problem-specific and no general framework has yet been proposed for Data Mining. In this paper we seek to remedy this by proposing, EDM - Evidence-based Data Mining - a general framework for Data Mining based on Evidence Theory. Having a general framework for Data Mining offers a number of advantages. It provides a common method for representing knowledge which allows prior knowledge from the user or knowledge discovered by another discovery process to be incorporated into the discovery process. A common knowledge representation also supports the discovery of meta-knowledge from knowledge discovered by different Data Mining techniques. Furthermore, a general framework can provide facilities that are common to most discovery processes, e.g. incorporating domain knowledge and dealing with missing values. The framework presented in this paper has the following additional advantages. The framework is inherently parallel. Thus, algorithms developed within this framework will also be parallel and will therefore be expected to be efficient for large data sets - a necessity as most commercial data sets, relational or otherwise, are very large. This is compounded by the fact that the algorithms are complex. Also, the parallelism within the framework allows its use in parallel, distributed and heterogeneous databases. The framework is easily updated and new discovery methods can be readily incorporated within the framework, making it 'general' in the functional sense in addition to the representational sense considered above. The framework provides an intuitive way of dealing with missing data during the discovery process using the concept of Ignorance borrowed from Evidence Theory.
The framework consists of a method for representing data and knowledge, and methods for data manipulation or knowledge discovery(1). We suggest an extension of the conventional definition of mass functions in Evidence Theory for use in Data Mining, as a means to represent evidence of the existence of rules in the database. The discovery process within EDM consists of a series of operations on the mass functions. Each operation is carried out by an EDM operator. We provide a classification for the EDM operators based on the discovery functions performed by them and discuss aspects of the induction, domain and combination operator classes. The application of EDM to two separate Data Mining tasks is also addressed, highlighting the advantages of using a general framework for Data Mining in general and, in particular, using one that is based on Evidence Theory.",
  keywords = "DATA MINING, KNOWLEDGE DISCOVERY IN DATABASES, UNCERTAINTY HANDLING, EVIDENCE THEORY, PARALLEL DISCOVERY",
}

% NOTE(review): recorded as an Article whose `journal` is a book series (LNCS);
% presumably a conference paper. The proceedings title is not available here and
% the end page is unknown ("907--??").
@Article{anand.ea:high-performance-server:96,
  author = "S. S. Anand and D. A. Bell and J. G. Hughes and C. M. Shapcott",
  title = "A High-Performance Data Mining Server",
  journal = "Lecture Notes in Computer Science",
  volume = "1067",
  pages = "907--??",
  year = "1996",
  issn = "0302-9743",
}

% The ISBN value holds only the number itself, without a textual "ISBN" prefix.
@InProceedings{anand.ea:parallel:95,
  author = "S. S. Anand and C. Shapcott and D. Bell and J. Hughes",
  title = "Data mining in parallel",
  volume = "44",
  series = "Transputer and Occam Engineering",
  pages = "113--124",
  booktitle = "Proceedings of WoTUG-18: Transputer and occam Developments",
  year = "1995",
  publisher = "IOS Press",
  address = "Amsterdam",
  month = apr,
  isbn = "90-5199-222-X",
}

% Unsigned articles use the author "Anonymous", spelt consistently.
@Article{anonymous:getting-to:94,
  author = "Anonymous",
  title = "Getting to grips with arrears: `data mining' systems at the {Leeds}",
  journal = "Expert Systems",
  year = "1994",
  volume = "11",
  number = "2",
  pages = "122--124",
  month = may,
  keywords = "Applications, Data mining, kdd, Attar Software, Xpert Rule Analyser",
}

@Article{anonymous:intelligent-technology:93,
  author = "Anonymous",
  title = "Data Mining: Intelligent Technology Gets down to Business",
  journal = "PC AI",
  year = "1993",
  month = nov # "--" # dec,
}

@Article{anonymous:lessons:97,
  author = "Anonymous",
  title = "Lessons in Data Mining",
  journal = "Byte Magazine",
  volume = "22",
  number = "2",
  pages = "40--??",
  month = feb,
  year = "1997",
  issn = "0360-5280",
}

@Proceedings{anonymous:sigmod-93:93,
  editor = "Anonymous",
  booktitle = "SIGMOD '93. 1993 ACM SIGMOD. International Conference on Management of Data",
  title = "{SIGMOD} '93. 1993 {ACM} {SIGMOD}.
International Conference on Management of Data",
  volume = "22(2)",
  month = jun,
  publisher = "ACM Press",
  address = "New York, NY 10036, USA",
  year = "1993",
  issn = "0163-5808",
  series = "SIGMOD Record (ACM Special Interest Group on Management of Data)",
  classification = "C6160 (Database management systems (DBMS)); C4250 (Database theory); C7250 (Information storage and retrieval); C6170 (Expert systems); C6120 (File organisation); C6140D (High level languages); C6130 (Data handling techniques); C6150G (Diagnostic, testing, debugging and evaluating systems)",
  confdate = "26--28 May 1993",
  conflocation = "Washington, DC, USA",
  confsponsor = "ACM",
  keywords = "Benchmark programs; Database rules; Integrity; Join processing; Object-oriented databases; Memory-based implementations; DBMS implementation issues; Recovery; Knowledge discovery; Temporal reasoning; Data compression; Query optimisation; Secondary storage techniques; Search structures; Query languages; Interfaces; Intelligent/deductive DBMSs; Relational/parallel DBMS processing; Transaction management; Object/scientific DBMSs; Interoperability",
  thesaurus = "Data compression; Database management systems; Database theory; Inference mechanisms; Knowledge based systems; Program testing; Query languages; Query processing; Storage management; System recovery; Transaction processing",
}

% Only the acronym IS is brace-protected; the rest of the title is left to the
% bibliography style's casing rules. The end page is unknown ("79--??").
@Article{anonymous:supercomputers-knock-at-is-doors:92,
  author = "Anonymous",
  title = "Supercomputers Knock At {IS} Doors",
  journal = "Datamation",
  volume = "38",
  number = "24",
  pages = "79--??",
  day = "01",
  month = dec,
  year = "1992",
  issn = "0011-6963",
  abstract = "Cost-effective massively parallel designs gain converts for data mining and OLTP applications among leading edge users and traditional systems suppliers.",
}

@Book{anthony.ea:computational-learning:92,
  author = "Martin Anthony and Norman Biggs",
  title = "Computational learning theory: an introduction",
  year = "1992",
  publisher = "Cambridge University Press",
  series = "Cambridge Tracts in Theoretical Computer Science",
  volume = "30",
}

@InProceedings{anwar.ea:by-imprecise:92,
  author = "T. M. Anwar and H. W. Beck and S. B. Navathe",
  title = "Knowledge Mining by Imprecise Querying: {A} Classification-based System",
  booktitle = "Proceedings of the International Conference on Data Engineering",
  address = "Tempe, AZ",
  month = feb,
  year = "1992",
  pages = "622--630",
  abstract = "Knowledge mining is the process of discovering new knowledge that is hitherto unknown. Users with a lack of knowledge of database schemas engage in the process of knowledge mining by posing imprecise queries. An approach to knowledge mining by imprecise querying is presented that utilizes conceptual clustering techniques. In contrast to numeric or fuzzy set approaches which ultimately rely on some distance metric and threshold to processing such queries, conceptual clustering retrieves instances which are structurally, semantically, and pragmatically similar to the query even though they may not match the requirements exactly. The query processor has both a deductive and inductive component. The deductive component finds precise matches in the traditional sense, and the inductive component identifies ways in which imprecise matches may be considered similar. Ranking on similarity is done using the database taxonomy, by which similar instances become members of the same class. Relative similarity is determined by depth in the taxonomy. The conceptual clustering algorithm, its use in query processing and an example are presented.",
}

@Article{appleton:sales-surge:95,
  author = "E. L.
Appleton", title = "Sales surge as mainframes find a role in client\slash server", journal = "Datamation", volume = "41", number = "10", pages = "48", month = jun, year = "1995", ISSN = "0011-6963", classification = "D5010 (Computers and work stations); D5020 (Computer networks and intercomputer communications)", keywords = "Mainframes; Client/server; Demand; Economy; Large-system market; Vendors; IBM Parallel Sysplex; UNIX server; NT server; Pyramid; HP T-500; Data mining; Parallelism; IBM Power Parallel; Amdahl ECL mainframe", language = "English", pubcountry = "USA", thesaurus = "Client-server systems; DP industry; Mainframes", } @InProceedings{apte.ea:predicting-defects:93, author = "Chidanand Apt\'e and Sholom Weiss and Gordon Grout", title = "Predicting defects in Disk Drive Manufacturing: a case study in High-Dimensional Classification", booktitle = "Proceedings of the 9th Conference on Artificial Intelligence for Applications", pages = "212--218", address = "Orlando, Florida", year = "1993", } @InProceedings{arning.ea:linear-method:96, title = "A Linear Method for Deviation Detection in Large Databases", pages = "164", author = "Andreas Arning and Rakesh Agrawal and Prabhakar Raghavan", crossref = "simoudis.ea:proceedings-second:96", } @InProceedings{aronis.ea:exploiting-background:96, title = "Exploiting Background Knowledge in Automated Discovery", pages = "355", author = "John M. Aronis and Foster J. Provost and Bruce G. Buchanan", crossref = "simoudis.ea:proceedings-second:96", } @InProceedings{aronis.ea:increasing-efficiency:97, title = "Increasing the Efficiency of Data Mining Algorithms with Breadth-First Marker Propagation", author = "John M. Aronis and Foster J. Provost", pages = "119", crossref = "heckerman.ea:proceedings-third:97", } @Article{ash.ea:lead-identification:97, author = "S. Ash and S. 
Gothe",
  address = "Tripos Inc, St Louis, Mo, 63144",
  title = "Data mining for lead identification and explosion",
  journal = "Abstracts Of Papers Of The American Chemical Soc.",
  year = "1997",
  volume = "213",
  number = "Pt1",
  pages = "57--CINF",
}

@TechReport{back.ea:managing-complexity:96,
  author = "Barbro Back and Mikko Irjala and Kaisa Sere and Hannu Vanharanta",
  title = "Managing Complexity in Large Data Bases Using Self-Organizing
    Maps",
  institution = "TUCS - Turku Centre for Computer Science",
  number = "TUCS-TR-48",
  month = oct # " 23",
  year = "1996",
  keywords = "neural networks, self-organizing maps, data bases,
    benchmarking",
  URL = "http://www.tucs.abo.fi/publications/techreports/TR48.html",
  abstract = "The amount of financial information in today's sophisticated
    large data bases is huge and makes comparisons between company
    performance - especially over time - difficult or at least very time
    consuming. The aim of this paper is to investigate whether neural
    networks in the form of self-organizing maps can be used to manage the
    complexity in large data bases. We structure and analyze accounting
    numbers in a large data base over several time periods. By using self
    organizing maps, we overcome the problems associated with finding the
    appropriate underlying distribution and the functional form of the
    underlying data in the structuring task that is often encountered, for
    example, when using cluster analysis. The method chosen also offers a
    way of visualizing the results. The data base in this study consists of
    annual reports of more than 120 world wide forest companies with data
    from a five year time period. This paper is an extended version of our
    paper Data Mining Accounting Numbers Using Self Organising Maps
    presented at Finnish Artificial Intelligence Conference in Vasa 20-23
    August 1996.",
}

@InProceedings{bain.ea:reduce-automatic:96,
  author = "B. Bain and C. Sammut and A. Sharma and J.
Shepherd",
  title = "{ReDuce}: {Automatic} Structuring and Compression in Relational
    Databases",
  booktitle = "Proceedings of the MLnet Familiarization Workshop on Data
    Mining with Inductive Logic Programming",
  pages = "41--52",
  year = "1996",
}

@Article{baldwin:using-fuzzy:96,
  author = "J. F. Baldwin",
  address = "Univ Bristol, Dept Engn Math, Bristol, Avon, England",
  title = "Knowledge from data using fuzzy methods",
  journal = "Pattern Recognition Letters",
  year = "1996",
  volume = "17",
  number = "6",
  pages = "593--600",
  abstract = "The basic concept of a data browser is explained and some
    methods are described which are suitable for extracting knowledge from
    data as an induction process. The data browser gives data mining
    capabilities but also provides a stage for computers and users to act
    out their parts in this knowledge discovery process.",
}

@Article{basta:molecules-to:96,
  author = "N. Basta",
  address = "Us Dept Def, Off Infosec Comp Sci, Ft George G Meade, Md, 20755",
  title = "From molecules to models to data mining",
  journal = "Chemical Engineering",
  year = "1996",
  volume = "103",
  number = "2",
  pages = "5",
}

@InProceedings{bayardo:brute-force-high-confidence:97,
  author = "Bayardo, Jr., Roberto J.",
  title = "Brute-Force Mining of High-Confidence Classification Rules",
  pages = "123",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InProceedings{bell:maintenance-functional:95,
  author = "S. Bell",
  title = "Discovery and Maintenance of Functional Dependencies by
    Independencies",
  booktitle = "Proceedings of the Workshop on Knowledge Discovery in
    Databases",
  pages = "27--32",
  publisher = "AAAI Press",
  year = "1995",
}

@Article{bell:properties-to:93,
  author = "D. A. Bell",
  address = "Univ Ulster, Dept Informat Sci, Jordanstown Bt37 0Qb, Antrim, North Ireland",
  title = "From data properties to evidence",
  journal = "Ieee Trans.
On Knowledge And Data Engineering", year = "1993", volume = "5", issue = "6", pages = "965--969", abstract = "Information and knowledge in computerized information systems are often characterized by uncertainty. The facts needed for some realistic applications are unavailable or are crudely estimated or judged. This problem manifests itself frequently in information systems centered on databases. We describe here an exploration of an aspect of the problem of handling uncertain evidence on which reasoning is to be based. We focus upon the problem of making decisions among propositions based on both uncertain data items (in contrast to data in conventional databases) and arguments which are not certain. The primary knowledge discovery issue we address is a classification problem - which classification does the available evidence support? The method investigated here seeks to exploit information available from conventional database systems - namely, the integrity assertions or data dependency information contained in the database. This information, e.g., from functional dependencies and a form of multivalued dependencies, allows us to rank arguments in terms of their strengths. Hence, as a step in the process of discovering classification knowledge, using a database as a secondary knowledge discovery exercise, we explicate latent knowledge pertinent to arguments of relevance to the purpose at hand. This is called evidence. Information is requested via user prompts from an evidential reasoner. It is fed as evidence to the reasoner. An object-oriented structure for managing evidence is used to model the conclusion space and to reflect the evidence structure. 
The implementation of the evidence structure and an example of its use are
    outlined.",
  keywords = "CLASSIFICATION, DATA DEPENDENCIES, DATABASE, EVIDENCE BASE,
    EVIDENTIAL REASONING, INTEGRITY CONSTRAINTS",
}

@InProceedings{bell:value-added-evidential:94,
  author = "D. Bell",
  title = "Value-added databases: knowledge discovery and evidential
    reasoning",
  booktitle = "Proceedings of the International Workshop on Advances in
    Databases and Information Systems - {ADBIS'94}",
  address = "Moscow",
  year = "1994",
  month = may # " 23--26",
  pages = "2--9",
  abstract = "Results of research into methods of managing evidence can be
    coupled with the power and capacity of data management systems to give
    a potent approach to discovering interesting but hidden patterns in
    large collections of data. We present some pertinent results from
    evidence theory and its applications, and suggest an approach to the
    exploitation of these results in the discovery of knowledge which is
    held in databases. In this sense we {\em add value} to databases, which
    presumably already justify their existence, and hence further increase
    the attractiveness of very large database systems.",
}

@TechReport{bentrup.ea:examination-inductive:93,
  author = "John A. Bentrup and Sylvian R. Ray",
  title = "An Examination of Inductive Learning Algorithms for the
    Classification of Sleep Signals",
  institution = "Department of Computer Science, University of Illinois at
    Urbana-Champaign",
  type = "Report",
  number = "UIUCDCS-R-93-1792",
  address = "1304 Springfield Avenue, Urbana, IL 61801",
  month = feb,
  year = "1993",
  URL = "ftp://a.cs.uiuc.edu/pub/TechReports/UIUCDCS-R-93-1792.ps.Z",
  note = "Modified version to appear in Proceedings of the 30th Annual
    Rocky Mountain Bioengineering Symposium (April 1993).",
  annote = "Nine inductive learning algorithms are tested on sleep signals
    of 161 subjects. Algorithms are ID3, C4, CART, MDL, AIMS, Bayes,
    PLS(K), PRG, Nearest Neighbour and COBWEB.
Nice table summarising algorithms.",
}

@InCollection{bergadano.ea:integrated-learning:91,
  author = "F. Bergadano and A. Giordana and L. Saitta",
  title = "Integrated Learning in a Real Domain",
  editor = "Gregory Piatetsky-Shapiro and William J. Frawley",
  booktitle = "Knowledge Discovery in Databases",
  publisher = "AAAI Press / The MIT Press",
  address = "Menlo Park, California",
  edition = "First",
  year = "1991",
  pages = "277--288",
}

@InProceedings{bergsten.ea:applying-machine:97,
  author = "Ulla Bergsten and Johan Schubert and Per Svensson",
  title = "Applying Data Mining and Machine Learning Techniques to Submarine
    Intelligence Analysis",
  pages = "127",
  crossref = "heckerman.ea:proceedings-third:97",
}

@Article{berleant.ea:hot-topics:94,
  author = "D. Berleant and H. Berghel",
  title = "Hot Topics: Customizing information. 2. {How} successful are we
    so far?",
  journal = "Computer",
  volume = "27",
  number = "10",
  pages = "76--78",
  month = oct,
  year = "1994",
  ISSN = "0018-9162",
  affiliation = "Dept. of Comput. Syst.
Eng., Arkansas Univ., Fayetteville, AR, USA", classification = "C6130D (Document processing techniques); C7210 (Information services and centres); C7250N (Front end systems for online searching)", keywords = "Advanced information customization; Browsing; Data interchange; Digital library; Document customization; Filtering; Hypermedia; Hypertext; Information analysis; Information extraction; Information retrieval; Information science; Information-customizing interfaces; Interactivity; Knowledge discovery; Nonprescriptive structuring", thesaurus = "Document handling; Full-text databases; Hypermedia; Information retrieval; Online front-ends", } @InProceedings{bernstein:enactment-information:93, author = "Mark Bernstein", title = "Enactment in Information Farming", booktitle = "Proceedings of ACM Hypertext'93", series = "Technical Briefings", pages = "242--249", year = "1993", copyright = "(c) Copyright 1993 Association for Computing Machinery", keywords = "Design, Rhetoric, Enactment, Collaboration, Information farming", abstract = "Information farming views the cultivation of information as a continuing, collaborative activity performed by groups of people working together to achieve changing individual and common goals. Failure to differentiate information farming from related but distinct activities like information mining and data factories has been a fruitful source of misunderstanding and discord in the hypertext literature and in the design of hypertext environments. Dramatic enactment and visual salience -- not recall, precision, or usability -- assume primary roles in design for information gardening. In this technical briefing, we examine how enactment contribute to the success and failure of a variety of Hypergate and Storyspace features.", } @InProceedings{berry.ea:computational-methods:95, author = "Michael W. Berry and Susan T. Dumais and Todd A. 
Letsche",
  title = "Computational Methods for Intelligent Information Access",
  booktitle = "Proceedings of Supercomputing'95",
  publisher = "ACM/IEEE",
  address = "San Diego, CA",
  month = dec,
  year = "1995",
  keywords = "data mining, indexing, information, latent, matrices,
    retrieval, semantic, singular value decomposition (SVD), sparse,
    updating",
  annote = "ps/PDF on the CD with MPEG.",
}

@InProceedings{bettini.ea:testing-complex:96,
  author = "Bettini, C. and Wang, X. Sean and Jajodia, S.",
  title = "Testing Complex Temporal Relationships Involving Multiple
    Granularities and Its Application to Data Mining",
  organization = "ACM",
  booktitle = "Proceedings of the Fifteenth {ACM} {SIGACT}-{SIGMOD}-{SIGART}
    Symposium on Principles of Database Systems, {PODS} 1996,
    Montr{\'e}al, Canada, June 3--5, 1996",
  volume = "15",
  publisher = "ACM Press",
  address = "New York, NY 10036, USA",
  year = "1996",
  series = "Proceedings of the ACM SIGACT SIGMOD SIGART Symposium on
    Principles of Database Systems",
  pages = "68--78",
  annote = "Held in conjunction with the 1996 ACM SIGMOD international
    conference on management of data. Also known as PODS 1996",
  keywords = "database systems; PODS; ACM; SIGMOD; SIGART; SIGACT",
}

@Article{bettini:time-dependent-concepts:97,
  author = "C. Bettini",
  address = "Univ Milan, Dipartimento Sci Informaz, I-20122 Milan, Italy",
  title = "Time-dependent concepts: representation and reasoning using
    temporal description logics",
  journal = "Data \& Knowledge Engineering",
  year = "1997",
  volume = "22",
  number = "1",
  pages = "1--38",
  abstract = "A time-dependent concept is a conceptual entity that is
    defined in terms of temporal relationships with other entities. For
    example, the concept of an action is defined in terms of a set of
    temporal relationships among states of a system. The concept of
    ``widow'', in natural language, is defined in terms of events that have
    occurred in the past.
Time-dependent concepts appear in several application areas, from natural language to diagnosis, from planning to data mining. An interesting issue in knowledge representation is how to formally represent and reason with these concepts. In this paper, we represent a family of formal representation languages obtained as an interval-based temporal extension of description logics. We illustrate the expressiveness of these formalisms in representing time-dependent concepts with respect to standard description logics and other extensions. We give some complexity results for reasoning problems and we propose approximate algorithms to compute subsumption among time-dependent concepts.", keywords = "INTERVALS, temporal knowledge, temporal reasoning, description logics, taxonomies, subsumption algorithms, temporal objects", } @Article{bhandari.ea:advanced-scout:97, author = "Inderpal Bhandari and Ed Colet and Jennifer Parker and Zachary Pines and Rajiv Pratap and Krishnakumar Ramanujam", title = "Advanced Scout: Data Mining and Knowledge Discovery in {NBA} data", journal = "Data Mining and Knowledge Discovery", year = "1997", volume = "1", number = "1", annote = "Advanced Scout is a PC-based data mining application used by National Basketball Association (NBA) coaching staffs to discover interesting patterns in basketball game data. We describe Advanced Scout software from the perspective of data mining and knowledge discovery. This paper highlights the pre-processing of raw data that the program performs, describes the data mining aspects of the software and how the interpretation of patterns supports the process of knowledge discovery. The underlying technique of attribute focusing as the basis of the algorithm is also described. The process of pattern interpretation is facilitated by allowing the user to relate patterns to video tape.", } @Article{bhandari.ea:case-study-software:93, author = "I. Bhandari and M. Halliday and E. Tarver and D. Brown and J. Chaar and R. 
Chillarege", address = "Ibm Corp, Thomas J Watson Res Ctr, Yorktown Hts, Ny, 10598 Ibm Corp, Mid Hudson Valley Programming Lab, Wappingers Falls, Ny, 12590", title = "A case-study of software process improvement during development", journal = "Ieee Trans. On Software Engineering", year = "1993", volume = "19", issue = "12", pages = "1157--1170", abstract = "We present a case study of the use of a software process improvement method which is based on the analysis of defect data. The first step of the method is the classification of software defects using attributes which relate defects to specific process activities. Such classification captures the semantics of the defects in a fashion which is useful for process correction. The second step utilizes a machine- assisted approach to data exploration which allows a project team to discover such knowledge from defect data as is useful for process correction. We show that such analysis of defect data can readily lead a project team to improve their process during development.", keywords = "CYCLE, DATE EXPLORATION, DEFECT-BASED PROCESS IMPROVEMENT, IN-PROCESS METRICS, KNOWLEDGE DISCOVERY", } @Article{bhandari:attribute-focusing:94, author = "I. Bhandari", address = "Ibm Corp, Thomas J Watson Res Ctr, Yorktown Hts, Ny, 10598", title = "Attribute focusing - machine-assisted knowledge discovery applied to software production process-control", journal = "Knowledge Acquisition", year = "1994", volume = "6", issue = "3", pages = "271--294", abstract = "How can people who are not trained in data analysis discover knowledge from a database of attribute-valued data? I address this question by presenting a man-machine approach to knowledge discovery called Attribute Focusing and its application to software production process control. Attribute Focusing utilizes an automatic filter to focus attention on that small part of a large amount of data which is interesting. 
A person studies that part in a manner which leads him to discover knowledge
    about the physical situation to which the data pertain. Specifically,
    the paper describes: 1. A model of interestingness of data based on the
    magnitude of data values, the association of data values and basic
    knowledge of the limits of human processing. 2. The use of that model
    of interestingness by people to discover knowledge. 3. The application
    of the Attribute Focusing approach to diagnose and correct the software
    production process. Based on the results that have been observed, the
    paper concludes that man-machine approaches to knowledge discovery
    should be emphasized much more than has been in the past, and that
    Attribute Focusing is a powerful, practical approach to such
    discovery.",
}

% NOTE(review): the address on the next entry is identical to the IBM address
% on bhandari.ea:case-study:93 and looks like a copy/paste slip -- confirm the
% authors' real affiliation before relying on it.
@Article{bissantz.ea:data-mining:93,
  author = "N. Bissantz and J. Hagedorn",
  address = "Ibm Corp, Thomas J Watson Res Ctr, Yorktown Hts, Ny, 10598 Ibm Corp, Mid Hudson Valley Programming Lab, Wappingers Falls, Ny, 12590",
  title = "Data mining",
  journal = "Wirtschaftsinformatik",
  year = "1993",
  volume = "35",
  number = "5",
  pages = "481--487",
}

@InProceedings{blockeel.ea:relational:96,
  author = "Blockeel, H. and De Raedt, L.",
  title = "Relational knowledge discovery in databases",
  booktitle = "Proceedings of the 6th International Workshop on Inductive
    Logic Programming",
  editor = "S. Muggleton",
  publisher = "Stockholm University, Royal Institute of Technology",
  pages = "1--13",
  year = "1996",
}

@Article{blum:confirmation-incorporation:82,
  author = "Robert L. Blum",
  title = "Discovery, Confirmation and Incorporation of Causal Relationships
    from a Large Time-Oriented Clinical Database: The {RX} Project",
  journal = "Computers and Biomedical Research",
  volume = "15",
  pages = "164--187",
  year = "1982",
}

@Book{blum:representation-causal:82,
  author = "Robert L.
Blum",
  title = "Discovery and Representation of Causal Relationships from a Large
    Time-Oriented Clinical Database: The {RX} Project",
  year = "1982",
  publisher = "Springer-Verlag",
  series = "Lecture Notes in Medical Informatics",
  volume = "19",
}

@Article{blumer.ea:occams-razor:87,
  author = "Anselm Blumer and Andrzej Ehrenfeucht and David Haussler and
    Manfred K. Warmuth",
  title = "{Occam's} Razor",
  journal = "Information Processing Letters",
  volume = "24",
  pages = "377--380",
  year = "1987",
}

@InProceedings{breitner.ea:process-based-database:97,
  author = "Christoph Breitner and J{\"{o}}rg Schl{\"{o}}sser and
    R{\"{u}}diger Wirth",
  title = "Process-Based Database Support for the Early Indicator Method",
  pages = "131",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InProceedings{brezellec.ea:samia-bottom-up:93,
  author = "Pierre Br{\'e}zellec and Henri Soldano",
  title = "{SAMIA}: a bottom-up learning method using a simulated annealing
    algorithm",
  booktitle = "Proceedings of the European Conference on Machine Learning",
  series = "Lecture Notes in Artificial Intelligence",
  pages = "297--309",
  publisher = "Springer-Verlag",
  year = "1993",
}

@InProceedings{brockhausen.ea:direct-access:96,
  author = "P. Brockhausen and K. Morik",
  title = "Direct Access of an {ILP} Algorithm to a Database Management
    System",
  booktitle = "Proceedings of the MLnet Familiarization Workshop on Data
    Mining with Inductive Logic Programming",
  pages = "95--110",
  year = "1996",
}

@Article{brodley.ea:applying-classification:,
  author = "C. E. Brodley and P. Smyth",
  title = "Applying classification algorithms in practice",
  journal = "Statistics and Computing",
  note = "Preprint; to appear",
  URL = "http://yake.ecn.purdue.edu/~brodley/my-papers/publications.html",
}

@InProceedings{browne.ea:distributed-information:95,
  author = "Shirley Browne and Jack Dongarra and Geoffrey C.
Fox and Ken Hawick and Ken Kennedy and Rick Stevens and Robert Olson and Tom
    Rowan",
  title = "Distributed Information Management in the National {HPCC}
    Software Exchange",
  booktitle = "Proceedings of Supercomputing'95",
  publisher = "ACM/IEEE",
  address = "San Diego, CA",
  month = dec,
  year = "1995",
  keywords = "data mining, information management, information retrieval,
    HPCC, high performance computing, software repository",
  annote = "Simple html document on CD.",
}

@InProceedings{brunk.ea:mineset-integrated:97,
  author = "Cliff Brunk and James Kelly and Ron Kohavi",
  title = "{MineSet}: An Integrated System for Data Mining",
  pages = "135",
  crossref = "heckerman.ea:proceedings-third:97",
}

@Article{buntine:guide-to:96,
  author = "W. Buntine",
  address = "Thinkbank, 1678 Shattuck Ave, Suite 320, Berkeley, Ca, 94709",
  title = "A guide to the literature on learning probabilistic networks from
    data",
  journal = "IEEE Transactions on Knowledge and Data Engineering",
  year = "1996",
  volume = "8",
  number = "2",
  pages = "195--210",
  abstract = "This literature review discusses different methods under the
    general rubric of learning Bayesian networks from data, and includes
    some overlapping work on more general probabilistic networks.
    Connections are drawn between the statistical, neural network, and
    uncertainty communities, and between the different methodological
    communities, such as Bayesian, description length, and classical
    statistics. Basic concepts for learning and Bayesian networks are
    introduced and methods are then reviewed. Methods are discussed for
    learning parameters of a probabilistic network, for learning the
    structure, and for learning hidden variables.
The presentation avoids formal definitions and theorems, as these are
    plentiful in the literature, and instead illustrates key concepts with
    simplified examples.",
  keywords = "EXPERT-SYSTEMS, BAYESIAN NETWORKS, GRAPHICAL MODELS,
    INDEPENDENCE, COMPLEXITY, HIDDEN VARIABLES, LEARNING, LEARNING
    STRUCTURE, PROBABILISTIC NETWORKS, KNOWLEDGE DISCOVERY",
}

@InCollection{cai.ea:attribute-oriented-induction:91,
  author = "Yandong Cai and Nick Cercone and Jiawei Han",
  title = "Attribute-Oriented Induction in Relational Databases",
  editor = "Gregory Piatetsky-Shapiro and William J. Frawley",
  booktitle = "Knowledge Discovery in Databases",
  publisher = "AAAI Press / The MIT Press",
  address = "Menlo Park, California",
  edition = "First",
  year = "1991",
  pages = "213--228",
}

@InCollection{carbonell.ea:overview-machine:83,
  author = "Jaime G. Carbonell and Ryszard S. Michalski and Tom M. Mitchell",
  title = "An overview of machine learning",
  pages = "3--24",
  crossref = "michalski.ea:machine-learning:83",
}

@Article{carter.ea:assessing-credit:87,
  author = "Chris Carter and Jason Catlett",
  title = "Assessing Credit Card Applications Using Machine Learning",
  journal = "IEEE Expert",
  volume = "2",
  number = "3",
  pages = "71--79",
  year = "1987",
  note = "Fall 1987",
}

@Article{carter.ea:fast-online:95,
  author = "C. L. Carter and H. J. Hamilton",
  address = "Univ Regina, Dept Comp Sci, Regina, Sk S4S 0A2, Canada",
  title = "A fast, online generalization algorithm for knowledge discovery",
  journal = "Applied Mathematics Letters",
  year = "1995",
  volume = "8",
  number = "2",
  pages = "5--11",
  abstract = "We present an O(n) algorithm for generalizing a database
    relation using concept hierarchies, where n is the number of tuples in
    the input relation. The algorithm is based on a variant of Han et al.'s
    attribute-oriented O(n log n) algorithm.
Our algorithm is an on-line algorithm; fast performance is achieved because
    after encountering a tuple and generalizing it, the location of the
    appropriate counter to increment is calculated instead of searched
    for.",
  keywords = "KNOWLEDGE DISCOVERY, DATA MINING, DATABASES, CONCEPT
    HIERARCHIES, GENERALIZATION",
}

% NOTE(review): @PhdThesis requires a `school` field; the awarding institution
% is not stated anywhere in this file -- add it once confirmed.
@PhdThesis{catlett:megainduction-machine:91,
  author = "Jason Catlett",
  title = "Megainduction: machine learning on very large databases",
  year = "1991",
  URL = "http://www.research.att.com/orgs/ssr/people/catlett/phd.html",
}

@Article{cercone.ea:ieee-transactions:93,
  key = "cercone.ea:ieee-transactions:93",
  editor = "N. Cercone and M. Tsuchiya",
  title = "{IEEE} Transactions on Knowledge and Data Engineering Special
    issue on Learning and Discovery in Databases",
  journal = "IEEE Transactions on Knowledge and Data Engineering",
  year = "1993",
  volume = "5",
  number = "6",
  month = dec,
  note = "Special issue on Learning and Discovery in Databases (N. Cercone
    and M. Tsuchiya, guest editors)",
}

@InProceedings{cerquides.ea:proposal-empirical:97,
  author = "Cerquides, Jes{\'u}s and L{\'o}pez de M{\`a}ntaras, Ramon",
  title = "Proposal and Empirical Comparison of a Parallelizable
    Distance-Based Discretization Method",
  pages = "139",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InProceedings{chan.ea:experiments-multistrategy:93,
  author = "Philip K. Chan and Salvatore J. Stolfo",
  title = "Experiments in Multistrategy Learning by Meta-Learning",
  booktitle = "Proceedings of the Second International Conference on
    Information and Knowledge Management",
  pages = "314--323",
  address = "Washington, DC",
  year = "1993",
}

@InProceedings{chan.ea:sharing-learned:96,
  author = "Philip K. Chan and Salvatore J. Stolfo",
  title = "Sharing Learned Models among Remote Database Partitions by Local
    Meta-Learning",
  pages = "2",
  crossref = "simoudis.ea:proceedings-second:96",
}

@Article{chatfield:model-uncertainty:95,
  author = "C.
Chatfield",
  address = "Univ Bath, Sch Math Sci, Bath Ba2 7Ay, Avon, England",
  title = "Model uncertainty, data mining and statistical inference",
  journal = "Journal of the Royal Statistical Society. Series A (Statistics
    in Society)",
  year = "1995",
  volume = "158",
  number = "3",
  pages = "419--466",
  abstract = "This paper takes a broad, pragmatic view of statistical
    inference to include all aspects of model formulation. The estimation
    of model parameters traditionally assumes that a model has a
    prespecified known form and takes no account of possible uncertainty
    regarding the model structure. This implicitly assumes the existence of
    a 'true' model, which many would regard as a fiction. In practice model
    uncertainty is a fact of life and likely to be more serious than other
    sources of uncertainty which have received far more attention from
    statisticians. This is true whether the model is specified on
    subject-matter grounds or, as is increasingly the case, when a model is
    formulated, fitted and checked on the same data set in an iterative,
    interactive way. Modern computing power allows a large number of models
    to be considered and data-dependent specification searches have become
    the norm in many areas of statistics. The term data mining may be used
    in this context when the analyst goes to great lengths to obtain a good
    fit. This paper reviews the effects of model uncertainty, such as too
    narrow prediction intervals, and the non-trivial biases in parameter
    estimates which can follow data-based modelling. Ways of assessing and
    overcoming the effects of model uncertainty are discussed, including
    the use of simulation and resampling methods, a Bayesian model
    averaging approach and collecting additional data wherever possible.
Perhaps the main aim of the paper is to ensure that statisticians are aware
    of the problems and start addressing the issues even if there is no
    simple, general theoretical fix.",
  keywords = "MOVING AVERAGE MODELS, BOOTSTRAP, VALIDATION, PREDICTION,
    COMPLEXITY, SELECTION, CHOICE, AUTOREGRESSIVE MODEL, BAYESIAN MODEL
    AVERAGING, DATA MINING, FORECASTING, MODEL BUILDING, RESAMPLING,
    STATISTICAL INFERENCE, SUBSET SELECTION",
}

@InProceedings{chattratichat.ea:large-scale:97,
  author = "Jaturon Chattratichat and John Darlington and Moustafa Ghanem
    and Harald H{\"{u}}ning and Yike Guo and Martin K{\"{o}}hler and Janjao
    Sutiwaraphun and Hing Wing To and Dan Yang",
  title = "Large Scale Data Mining: Challenges and Responses",
  pages = "143",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InCollection{cheeseman.ea:bayesian-classification:95,
  author = "P. Cheeseman and J. Stutz",
  title = "Bayesian Classification ({AUTOCLASS}): Theory and Results",
  booktitle = "Advances in Knowledge Discovery and Data Mining",
  editor = "U. M. Fayyad and G. Piatetsky-Shapiro and P. Smyth and R.
    Uthurusamy",
  year = "1995",
}

@InProceedings{chen.ea:efficient-path:96,
  author = "M. S. Chen and J. S. Park and P. S. Yu",
  title = "Efficient Data Mining for Path Traversal Patterns in Distributed
    Systems",
  booktitle = "16th International Conference on Distributed Computing
    Systems (16th {ICDCS}'96)",
  pages = "385--393",
  publisher = "IEEE",
  address = "Hong Kong",
  month = may,
  year = "1996",
  keywords = "Distributed Objects",
  note = "IBM T. J. Watson Research Center, USA. End page uncertain in the
    source record.",
}

@Article{chen.ea:overview-database:96,
  author = "Ming-Syan Chen and Jiawei Han and Philip S. Yu",
  address = "Natl Taiwan Univ, Dept Elect Engn, Taipei 10764, Taiwan Simon Fraser Univ, Sch Comp Sci, Burnaby, Bc V5A 1S6, Canada Ibm Corp, Thomas J Watson Res Ctr, Yorktown Hts, Ny, 10598",
  title = "Data mining: an overview from a database perspective",
  journal = "Ieee Trans.
On Knowledge And Data Engineering", year = "1996", month = dec, volume = "8", issue = "6", pages = "866--883", abstract = "Mining information and knowledge from large databases has been recognized by many researchers as a key research topic in database systems and machine learning, and by many industrial companies as an important area with an opportunity of major revenues. Researchers in many different fields have shown great interest in data mining. Several emerging applications in information providing services, such as data warehousing and on-line services over the Internet, also call for various data mining techniques to better understand user behavior, to improve the service provided, and to increase the business opportunities. In response to such a demand, this article is to provide a survey, from a database researcher's point of view, on the data mining techniques developed recently. A classification of the available data mining techniques is provided, and a comparative study of such techniques is presented.", keywords = "data mining, knowledge discovery, association rules, classification, data clustering, pattern matching algorithms, data generalization and characterization, data cubes, multiple-dimensional databases", } @Article{chen.ea:parallel-computing:96, author = "H. C. Chen and B. Schatz and T. Ng and J. Martinez and A. Kirchhoff and C. T. Lin", address = "Univ Arizona, Karl Eller Grad Sch Management, Mis Dept, Mcclelland Hall, Tucson, Az, 85721 Univ Illinois, Natl Ctr Supercomp Applicat, Beckman Inst, Urbana, Il, 61801 Univ Arizona, Sci \& Engn Lib, Tucson, Az, 85712 Univ Arizona, Dept Lib \& Informat Studies, Tucson, Az, 85712", title = "A parallel computing approach to creating engineering concept spaces for semantic retrieval - the illinois digital library initiative project", journal = "Ieee Trans. 
On Pattern Analysis And Machine Intelligence", year = "1996", volume = "18", issue = "8", pages = "771--782", abstract = "This research presents preliminary results generated from the semantic retrieval research component of the illinois Digital Library Initiative (DLI) project. Using a variation of the automatic thesaurus generation techniques, to which we refer as the concept space approach, we aimed to create graphs of domain-specific concepts (terms) and their weighted co-occurrence relationships for all major engineering domains. Merging these concept spaces and providing traversal paths across:different concept spaces could potentially help alleviate the vocabulary (difference) problem evident in large- scale information retrieval. We have experimented previously with such a technique for a smaller molecular biology domain (Worm Community System, with 10+ MBs of document collection) with encouraging results. In order to address the scalability issue related to large-scale information retrieval and analysis for the current Illinois DLI project, we recently conducted experiments using the concept space approach on parallel supercomputers. Our test collection included 2+ GBs of computer science and electrical engineering abstracts extracted from the INSPEC database. The concept space approach called for extensive textual and statistical analysis (a form of knowledge discovery) based on automatic indexing and cooccurrence analysis algorithms, both previously tested in the biology domain. Initial testing results using a 512-node CM-5 and a 16-processor SGI Power Challenge were promising. Power Challenge was later selected to create a comprehensive computer engineering concept space of about 270,000 terms and 4,000,000+ links using 24.5 hours of CPU time. 
Our system evaluation involving 12 knowledgeable subjects revealed that the automatically-created computer engineering concept space generated significantly higher concept recall than the human-generated INSPEC computer engineering thesaurus. However, the INSPEC was more precise than the automatic concept space. Our current work mainly involves creating concept spaces for other major engineering domains and developing robust graph matching and traversal algorithms for cross-domain, concept-based retrieval. Future work also will include generating individualized concept spaces for assisting user-specific concept-based information retrieval.",
  keywords = "INFORMATION-RETRIEVAL, DOCUMENT-RETRIEVAL, CONNECTION MACHINE, NEURAL NETWORKS, SYSTEMS, SEARCH, PERFORMANCE, DATABASES, DESIGN, MODEL, SEMANTIC RETRIEVAL, CONCEPT SPACE, CONCEPT ASSOCIATION, PARALLEL COMPUTING, DIGITAL LIBRARY",
}

@Article{chen.ea:semantics-based-information:92,
  author = "H. Chen and K. Lynch",
  title = "Semantics-Based Information Management and Retrieval: {A} Knowledge Discovery Approach",
  journal = "IEEE Transactions on Systems, Man, and Cybernetics",
  publisher = "IEEE",
  note = "Forthcoming",
  year = "1992",
  abstract = "We report results of a study that involved the creation of knowledge bases from large, operational textual databases. Two East-bloc computing knowledge bases, both based on semantic network structure, were created automatically using two statistical algorithms. With the help of four East-bloc computing experts, we evaluated the two knowledge bases in detail in a concept-association experiment based on recall and recognition tests. In our experiment, one of the knowledge bases that exhibited the asymmetric link property out-performed all four experts in recalling relevant concepts in East-bloc computing.
The knowledge base, which contained about 20,000 concepts (nodes) and 280,000 weighted relationships (links), was incorporated as a thesaurus-like component into an intelligent retrieval system. The system allowed users to perform semantics-based information management and information retrieval via interactive, conceptual relevance feedback. Current research efforts include development of a meta knowledge base and design of semantic network and neural network based inferencing algorithms.",
}

@InProceedings{cherkauer.ea:growing-simpler:96,
  title = "Growing Simpler Decision Trees to Facilitate Knowledge Discovery",
  pages = "315",
  author = "Kevin J. Cherkauer and Jude W. Shavlik",
  crossref = "simoudis.ea:proceedings-second:96",
}

@Article{cheung.ea:efficient-association:96,
  author = "D. W. Cheung and V. T. Ng and A. W. Fu and Y. J. Fu",
  address = "Univ Hong Kong, Dept Comp Sci, Hong Kong, Hong Kong Hong Kong Polytech Univ, Dept Comp, Hong Kong, Hong Kong Chinese Univ Hong Kong, Dept Comp Sci \& Engn, Hong Kong, Hong Kong Simon Fraser Univ, Sch Comp Sci, Burnaby, Bc V5A 1S6, Canada",
  title = "Efficient mining of association rules in distributed databases",
  journal = "IEEE Transactions on Knowledge and Data Engineering",
  year = "1996",
  month = dec,
  volume = "8",
  number = "6",
  pages = "911--922",
  abstract = "Many sequential algorithms have been proposed for mining of association rules. However, very little work has been done in mining association rules in distributed databases. A direct application of sequential algorithms to distributed databases is not effective, because it requires a large amount of communication overhead. In this study, an efficient algorithm, DMA, is proposed. It generates a small number of candidate sets and requires only O(n) messages for support count exchange for each candidate set, where n is the number of sites in a distributed database. The algorithm has been implemented on an experimental test bed and its performance is studied.
The results show that DMA has superior performance when comparing with the direct application of a popular sequential algorithm in distributed databases.",
  keywords = "data mining, knowledge discovery, distributed data mining, association rule, distributed database, distributed algorithm, partitioned database",
}

@InProceedings{cheung.ea:maintenance-discovered:96,
  title = "Maintenance of Discovered Knowledge: {A} Case in Multi-Level Association Rules",
  pages = "307",
  author = "David W. Cheung and Vincent T. Ng and Benjamin W. Tam",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{cheung.ea:rule-based-attribute-oriented:94a,
  key_modifier = "a",
  author = "D. W.-l. Cheung and A. W.-C. Fu and J. Han",
  title = "Knowledge discovery in databases: a rule-based attribute-oriented approach",
  pages = "164--173",
  editor = "Zbigniew W. Ra{\'s} and Maria Zemankova",
  booktitle = "Proceedings of the 8th International Symposium on Methodologies for Intelligent Systems",
  month = oct,
  series = "LNAI",
  volume = "869",
  publisher = "Springer",
  address = "Berlin",
  year = "1994",
}

@Article{cheung.ea:rule-based-attribute-oriented:94b,
  key_modifier = "b",
  author = "D. W.-l. Cheung and A. W.-C. Fu and J. Han",
  title = "Knowledge discovery in databases: a rule-based attribute-oriented approach",
  journal = "Lecture Notes in Computer Science",
  volume = "869",
  pages = "164--173",
  year = "1994",
  ISSN = "0302-9743",
}

@InProceedings{chien.ea:using-artificial:97,
  title = "Using Artificial Intelligence Planning to Automate Science Data Analysis for Large Image Databases",
  author = "Steve Chien and Forest Fisher and Helen Mortensen and Edisanter Lo and Ronald Greeley",
  pages = "147",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InCollection{choenni.ea:framework-query:96,
  author = "R. Sunil Choenni and Arno P. J. M.
Siebes",
  title = "A framework for query optimization to support data mining",
  publisher = "Centrum voor Wiskunde en Informatica (CWI)",
  ISSN = "0169-118X",
  month = oct # " 31",
  year = "1996",
  keywords = "data mining systems, search strategies, query optimization, physical database design.",
  URL = "ftp://ftp.cwi.nl/pub/CWIreports/AA/CS-R9637.ps.Z",
  abstract = "In order to extract knowledge from databases, data mining algorithms heavily query the databases. Inefficient processing of these queries will inevitably have its impact on the performance of these algorithms, making them less valuable. In this paper, we describe an optimization framework for an efficient processing of queries generated by different data mining algorithms. In this framework, we show how to take advantage of the physical organization of the database, the operators and the control structures used in an algorithm. Finally, we discuss how our framework fits into conventional query optimization frameworks.",
  note = "AA (Department of Algorithmics and Architecture)",
  annote = "originally contained the following fields and values - booktitle, 105 note, CS-R9637",
}

@InCollection{choenni.ea:on-multi-query:96,
  author = "R. (Sunil) Choenni and Martin L. Kersten and Johan F. P. van den Akker and Amani Saad",
  title = "On multi-query optimization",
  pages = "19",
  publisher = "Centrum voor Wiskunde en Informatica (CWI)",
  ISSN = "0169-118X",
  month = oct # " 31",
  year = "1996",
  keywords = "multi-query optimization, architectures, exploiting interdependencies between queries.",
  URL = "ftp://ftp.cwi.nl/pub/CWIreports/AA/CS-R9638.ps.Z",
  abstract = "In some key database applications, such as data mining, a sequence of interdependent queries may be posed simultaneously to the DBMS. The optimization of such sequences is called multi-query optimization, and it attempts to exploit these dependencies in the derivation of a query evaluation plan (qep).
Although it has been observed and demonstrated by several researchers that exploitation of dependencies speed up the query processing, limited research has been reported how to benefit from multi-query optimization, taking the capabilities of existing query optimizers into account. This is exactly the topic of this paper. Since existing optimizers are able to optimize queries in which a restricted number of basic operations appears, e.g., number of joins is limited to 10, and the optimization of a query is relatively expensive, we attempt to profit from multi query optimization under the condition that queries are passed only once and separately to the optimizer. We propose a two-step optimization procedure. In the first step, we determine, on the basis of the dependencies between queries, in which order they should be specified and what results should be stored. In the second step, each query is passed separately to an optimizer.",
  note = "AA (Department of Algorithmics and Architecture)",
  annote = "originally contained the following fields and values - note, CS-R9638, booktitle, 143",
}

@InProceedings{ciesielski.ea:using-hybrid:96,
  author = "Victor Ciesielski and Gregory Palstra",
  title = "Using a Hybrid Neural/Expert System for Data Base Mining in Market Survey Data",
  pages = "38",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{clancey:classification-problem:84,
  title = "Classification Problem Solving",
  author = "W. J. Clancey",
  editor = "R. J.
Brachman",
  booktitle = "Proceedings of the National Conference on Artificial Intelligence",
  address = "Austin, Texas",
  month = aug,
  year = "1984",
  publisher = "William Kaufmann",
  pages = "49--55",
}

@Article{clark.ea:cn2-induction:89,
  author = "Peter Clark and Tim Niblett",
  title = "The {CN2} Induction Algorithm",
  journal = "Machine Learning",
  volume = "3",
  pages = "261--283",
  year = "1989",
}

@InCollection{clark:representation-machine:89,
  author = "Peter Clark",
  title = "Knowledge Representation in Machine Learning",
  editor = "Yves Kodratoff and Alan Hutchinson",
  booktitle = "Machine and Human Learning, advances in European Research",
  publisher = "Michael Horwood",
  address = "London",
  pages = "35--49",
  year = "1989",
}

@InProceedings{clifton.ea:security-privacy:96,
  author = "Chris Clifton and Don Marks",
  title = "Security and Privacy Implications of Data Mining",
  booktitle = "Workshop on Data Mining and Knowledge Discovery",
  organization = "ACM SIGMOD",
  publisher = "University of British Columbia Department of Computer Science",
  address = "Montreal, Canada",
  number = "96-08",
  pages = "15--19",
  month = jun # " 2",
  year = "1996",
  URL = "ftp://ftp.fas.sfu.ca/pub/cs/han/dmkd96/p15.ps",
  contributedby = "clifton(at)mitre.org",
}

@InProceedings{cohen.ea:overfitting-explained:97,
  author = "P. R. Cohen and D. Jensen",
  title = "Overfitting Explained",
  booktitle = "Preliminary Papers of the Sixth International Workshop on Artificial Intelligence and Statistics",
  year = "1997",
  month = jan,
  pages = "115--122",
  abstract = "Overfitting arises when model components are evaluated against the wrong reference distribution. Most modeling algorithms iteratively find the best of several components and then test whether this component is good enough to add to the model.
We show that for independently distributed random variables, the reference distribution for any one variable underestimates the reference distribution for the highest-valued variable; thus variate values will appear significant when they are not, and model components will be added when they should not be added. We relate this problem to the well-known statistical theory of multiple comparisons or simultaneous inference.",
  abstract_url = "http://eksl-www.cs.umass.edu/~jensen/papers/ais97b.html",
  URL = "http://www-eksl.cs.umass.edu/papers/cohen-ais96b.ps",
}

@InProceedings{cohen.ea:role-development:94,
  author = "David Cohen and L. Berke and P. Bloom and D. Cohen and D. Tsur",
  title = "The Role of Knowledge Mining in the Development and Evolution of New Applications",
  pages = "166--167",
  editor = "Ahmed K. Elmagarmid and Erich Neuhold",
  booktitle = "Proceedings of the 10th International Conference on Data Engineering",
  address = "Houston, TX",
  month = feb,
  year = "1994",
  publisher = "IEEE Computer Society Press",
}

@InProceedings{compton.ea:context-strategy:88,
  author = "P. Compton and R. Jansen",
  title = "Knowledge in context: a strategy for expert system maintenance",
  booktitle = "Proceedings of the 2nd {A}ustralian Joint Artificial Intelligence conference",
  address = "Adelaide",
  year = "1988",
  publisher = "Springer",
  series = "Lecture Notes in Artificial Intelligence",
  volume = "406",
  pages = "292--306",
}

@Article{conklin.ea:molecular:93,
  author = "D. Conklin and S. Fortier and J. Glasgow",
  address = "Queens Univ, Dept Comp \& Informat Sci, Kingston K7L 3N6, On, Canada Queens Univ, Dept Chem, Kingston K7L 3N6, On, Canada",
  title = "Knowledge discovery in molecular databases",
  journal = "IEEE Transactions on Knowledge and Data Engineering",
  year = "1993",
  volume = "5",
  number = "6",
  pages = "985--987",
  abstract = "This paper describes an approach to knowledge discovery in complex molecular databases.
The machine learning paradigm used is structured concept formation, in which objects described in terms of components and their interrelationships are clustered and organized in a knowledge base. Symbolic images are used to represent classes of structured objects. A discovered molecular knowledge base is successfully used in the interpretation of a high resolution electron density map.",
  keywords = "PROTEIN, CASE-BASED REASONING, CHEMICAL INFORMATION RETRIEVAL, CONCEPTUAL CLUSTERING, DESCRIPTION LOGICS, INDEXING, RELATIONAL MODELS, SCENE ANALYSIS, SPATIAL CONCEPTS, SPATIAL REASONING, STRUCTURED CONCEPT FORMATION",
}

@Article{conklin:machine-protein:95,
  author = "D. Conklin",
  address = "Zymogenet Inc, 1201 Eastlake Ave E, Seattle, Wa, 98102",
  title = "Machine discovery of protein motifs",
  journal = "Machine Learning",
  year = "1995",
  volume = "21",
  number = "1-2",
  pages = "125--150",
  abstract = "The investigation of relations between protein tertiary structure and amino acid sequence is a topic of tremendous importance in molecular biology. The automated discovery of recurrent patterns of structure and sequence is an essential part of this investigation. These patterns, known as protein motifs, are abstractions of fragments drawn from proteins of known sequence and tertiary structure. This paper has two objectives. The first is to introduce and define protein motifs, and provide a survey of previous research on protein motif discovery. The second is to present and apply a novel approach to protein motif representation and discovery, which is based on a spatial description logic and the symbolic machine learning paradigm of structured concept formation.
A large database of protein fragments is processed using this approach, and several interesting and significant protein motifs are discovered.",
  keywords = "SECONDARY STRUCTURE, SEQUENCE PATTERNS, PREDICTIVE POWER, IDENTIFICATION, RECOGNITION, GENERATION, DEFINITION, TEMPLATES, SETS, PROTEIN TERTIARY STRUCTURE, MACHINE DISCOVERY, RELATIONAL LEARNING, KNOWLEDGE REPRESENTATION, DESCRIPTION LOGICS, INFORMATION RETRIEVAL, KNOWLEDGE DISCOVERY IN DATABASES",
}

@Article{cook.ea:scalable-informative:96,
  author = "D. J. Cook and L. B. Holder and S. Djoko",
  address = "Univ Texas, Dept Comp Sci \& Engn, Arlington, Tx, 76019 Bell No Res, Sci Staff, Richardson, Tx",
  title = "Scalable discovery of informative structural concepts using domain knowledge",
  journal = "IEEE Expert-Intelligent Systems \& Their Applications",
  year = "1996",
  volume = "11",
  number = "5",
  pages = "59--68",
}

@Article{cook.ea:substructure-using:94,
  author = "D. J. Cook and L. B. Holder",
  title = "Substructure Discovery Using Minimum Description Length and Background Knowledge",
  journal = "Journal of Artificial Intelligence Research",
  year = "1994",
  volume = "1",
  pages = "231--255",
  abstract = "The ability to identify interesting and repetitive substructures is an essential component to discovering knowledge in structural data. We describe a new version of our SUBDUE substructure discovery system based on the minimum description length principle. The SUBDUE system discovers substructures that compress the original data and represent structural concepts in the data. By replacing previously-discovered substructures in the data, multiple passes of SUBDUE produce a hierarchical description of the structural regularities in the data. SUBDUE uses a computationally-bounded inexact graph match that identifies similar, but not identical, instances of a substructure and finds an approximate measure of closeness of two substructures when under computational constraints.
In addition to the minimum description length principle, other background knowledge can be used by SUBDUE to guide the search towards more appropriate substructures. Experiments in a variety of domains demonstrate SUBDUE's ability to find substructures capable of compressing the original data and to discover structural concepts important to the domain.",
  annote = "The SUBDUE system discovers substructures that compress the original data and represent structural concepts in the data. By replacing previously-discovered substructures in the data, multiple passes of SUBDUE produce a hierarchical description of the structural regularities in the data.",
  URL = "gopher://P.GP.CS.CMU.EDU:70/00/volume1/cook94a.ps",
}

@InProceedings{cornish.ea:what-has:95,
  author = "Tremaine A. O. Cornish and Anthony D. Elliman",
  title = "What has Mill to Say About Data Mining ?",
  pages = "347--353",
  booktitle = "Proceedings of the Eleventh Conference on Artificial Intelligence for Applications",
  month = "20--2~" # feb,
  publisher = "IEEE Computer Society Press",
  address = "Los Alamitos",
  year = "1995",
}

@Article{cornish:historical-perspectives:96,
  author = "T. A. O. Cornish",
  address = "Brunel Univ, Dept Comp Sci \& Informat Syst, Uxbridge Ub8 3Ph, Middx, England",
  title = "Historical perspectives on information-science",
  journal = "Systems Research And Information Science",
  year = "1996",
  volume = "7",
  number = "2",
  pages = "105--116",
  abstract = "There is a general attitude in science and particularly computer science, that if something is more than five years old, then we have nothing to learn from it. This paper seeks first to destroy the basis of this myth with reference to areas of current research which are still striving to live up to visions set many years ago.
Secondly to look at an area of research, Knowledge Discovery in Databases and demonstrate that it too has a great deal to learn from the distant past, which has been all but overlooked.",
  keywords = "KNOWLEDGE DISCOVERY, SYSTEMATIC, SCIENTIFIC, DATA MINING, HISTORICAL, INFORMATION, SYSTEMS",
}

@InProceedings{cromp.ea:multi-dimensional-remotely:93,
  author = "Robert F. Cromp and William J. Campbell",
  title = "Data Mining of Multi-dimensional Remotely Sensed Images",
  pages = "471--480",
  editor = "Bharat Bhargava and Timothy Finin and Yelena Yesha",
  booktitle = "Proceedings of the 2nd International Conference on Information and Knowledge Management",
  month = nov,
  publisher = "ACM Press",
  address = "New York, NY, USA",
  year = "1993",
}

@Article{cupit.ea:exploiting-knowledge-level:96a,
  key_modifier = "a",
  author = "J. Cupit and N. Shadbolt",
  title = "Knowledge Discovery in Databases: Exploiting Knowledge-Level Redescription",
  journal = "Lecture Notes in Computer Science",
  volume = "1076",
  pages = "245--261",
  year = "1996",
  ISSN = "0302-9743",
}

@InProceedings{cupit.ea:exploiting-knowledge-level:96b,
  key_modifier = "b",
  author = "James Cupit and Nigel Shadbolt",
  title = "Knowledge Discovery in Databases: Exploiting Knowledge-Level Redescription",
  pages = "245--261",
  editor = "Nigel Shadbolt and Kieron O'Hara and Guus Schreiber",
  booktitle = "Proceedings of the Ninth European Knowledge Acquisition Workshop ({EKAW}-96)",
  month = "14--17~" # may,
  series = "LNAI",
  volume = "1076",
  publisher = "Springer",
  address = "Berlin",
  year = "1996",
}

@InProceedings{czyzewski:noisy-audio:96,
  title = "Mining Knowledge in Noisy Audio Data",
  pages = "220",
  author = "Andrzej Czyzewski",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{davies.ea:distributed-learning:95,
  title = "Distributed learning: {A}n agent-based approach to data-mining",
  author = "Winton Davies and Peter Edwards",
  booktitle = "Working Notes of the ICML '95 Workshop on Agents that Learn from Other Agents",
year = "1995", address = "Tahoe City, CA", editor = "Diana Gordon", } @Article{debska.ea:infrared-database:97, author = "B. J. Debska and B. Guzowskaswider", address = "Rzeszow Univ Technol, Dept Comp Chem, 6 Powstancow Warszawy Av, Pl-35041 Rseszow, Poland", title = "Knowledge discovery in an infrared database", journal = "Computers \& Chemistry", year = "1997", volume = "21", issue = "1", pages = "51--59", abstract = "The paper describes a process of knowledge acquisition in the collection of infrared spectra (infrared database). In fact it is a strategy for the automated generation of correlation tables, i.e. correlations between specific molecular subunits (substructures, chemical groups) and their absorption frequencies. The data in the tables are subsequently converted automatically into rules that can be used to infer the existence of molecular substructures from the IR spectrum of an analysed compound. Copyright (C) 1996 Elsevier Science Ltd", keywords = "SYSTEM, STRUCTURE IDENTIFICATION, SPECTROSCOPY METHODS, KNOWLEDGE DISCOVERY, RULE KNOWLEDGEBASE", } @TechReport{decker.ea:technology-overview:94, URL = "http://www.cscs.ch/Official/PubTR95.html", title = "Technology Overview: {A} Report on Data Mining", author = "K. Decker and S. Focardi", month = feb, year = "1994", } @InProceedings{decoste:multivariate-time-series:97, title = "Mining Multivariate Time-Series Sensor Data to Discover Behavior Envelopes", author = "Dennis DeCoste", pages = "151", crossref = "heckerman.ea:proceedings-third:97", } @Article{dejesus:theres-gold:95, author = "E. X. Dejesus", address = "Univ Bath, Sch Math Sci, Bath Ba2 7Ay, Avon, England", title = "Data Mining --- There's gold in those hills of data", journal = "Byte", year = "1995", volume = "20", issue = "10", pages = "81--81", } @Article{deraedt.ea:clausal:97, author = "L. Deraedt and L. 
Dehaspe",
  address = "Katholieke Univ Leuven, Dept Comp Sci, Celestijnenlaan 200A, B-3001 Heverlee, Belgium",
  title = "Clausal discovery",
  journal = "Machine Learning",
  year = "1997",
  volume = "26",
  number = "2-3",
  pages = "99--146",
  abstract = "The clausal discovery engine CLAUDIEN is presented. CLAUDIEN is an inductive logic programming engine that fits in the descriptive data mining paradigm. CLAUDIEN addresses characteristic induction from interpretations, a task which is related to existing formalisations of induction in logic. In characteristic induction from interpretations, the regularities are represented by clausal theories, and the data using Herbrand interpretations. Because CLAUDIEN uses clausal logic to represent hypotheses, the regularities induced typically involve multiple relations or predicates. CLAUDIEN also employs a novel declarative bias mechanism to define the set of clauses that may appear in a hypothesis.",
  keywords = "inductive logic programming, knowledge discovery in databases, data mining, learning, induction, semantics for induction, logic of induction, parallel learning",
}

@InProceedings{derthick.ea:interactive-environment:97,
  title = "An Interactive Visualization Environment for Data Exploration",
  author = "Mark Derthick and John Kolojejchick and Steven F. Roth",
  pages = "2",
  crossref = "heckerman.ea:proceedings-third:97",
  abstract = " Exploratory data analysis is a process of sifting through data in search of interesting information or patterns. Analysts' current tools for exploring data include database management systems, statistical analysis packages, data mining tools, visualization tools, and report generators. Since the exploration process seeks the unexpected in a data-driven manner, it is crucial that these tools are seamlessly integrated so analysts can flexibly select and compose tools to use at each stage of analysis. Few systems have integrated all these capabilities either architecturally or at the user interface level.
Visage's information-centric approach allows coordination among multiple application user interfaces. It uses an architecture that keeps track of the mapping of visual objects to information in shared databases. One result is the ability to perform direct manipulation operations such as drag-and-drop transfer of data among applications. This paper describes Visage's Visual Query Language and visualization tools, and illustrates their application to several stages of the exploration process: creating the target dataset, data cleaning and preprocessing, data reduction and projection, and visualization of the reduced data. Unlike previous integrated KDD systems' interfaces, direct manipulation is used pervasively, and the visualizations are more diverse and can be customized automatically as needed. Coordination among all interface objects simplifies iterative modification of decisions at any stage.",
  URL = "http://www.cs.cmu.edu/~sage/KDD97/KDD97/KDD97.html",
  url2 = "http://www.cs.cmu.edu/~sage/KDD97/KDD97.PS.GZ",
}

@Article{dhar.ea:abstract-driven-pattern:93,
  author = "V. Dhar and A. Tuzhilin",
  title = "Abstract-Driven Pattern Discovery in Databases",
  journal = "IEEE Transactions on Knowledge and Data Engineering",
  volume = "5",
  number = "6",
  pages = "926--938",
  month = dec,
  year = "1993",
}

@InCollection{dietterich.ea:comparative-review:83,
  author = "Thomas G. Dietterich and Ryszard S. Michalski",
  title = "A comparative review of selected methods for learning from examples",
  crossref = "michalski.ea:machine-learning:83",
  pages = "41--81",
}

@Article{dietterich.ea:comparison-id3:95,
  author = "T. G. Dietterich and H. Hild and G. Bakiri",
  title = "A comparison of {ID3} and backpropagation for English text-to-speech mapping.
(Preprint)",
  year = "1995",
  URL = "ftp://ftp.cs.orst.edu/users/t/tgd/papers/mlj-nettalk.ps.gz",
}

@InProceedings{domingos:efficient-specific-to-general:96,
  title = "Efficient Specific-to-General Rule Induction",
  pages = "319",
  author = "Pedro Domingos",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{domingos:linear-time-rule:96,
  title = "Linear-Time Rule Induction",
  pages = "96",
  author = "Pedro Domingos",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{domingos:why-does:97,
  title = "Why Does Bagging Work? {A} Bayesian Account and its Implications",
  author = "Pedro Domingos",
  pages = "155",
  crossref = "heckerman.ea:proceedings-third:97",
}

@Article{dou:chemistry-facing:96,
  author = "H. Dou",
  address = "Univ Aix Marseille 3, Crrm, Ctr St Jerome, F-13397 Marseille 20, France",
  title = "Chemistry facing the phenomena of data mining idea mining and knowledge recovery",
  journal = "Analusis",
  year = "1996",
  volume = "24",
  number = "2",
  pages = "M 8--M 12",
  keywords = "LAW",
}

@Article{drobnic.ea:use-artificial-intelligence:96,
  author = "M. Drobnic and M. Mozetic and T. Mozetic and M. Gams",
  address = "Jozef Stefan Inst, Jamova 39, Ljubljana 1001, Slovenia Inst Surface Engn \& Optoelect, Ljubljana 1001, Slovenia High Med Coll, Ljubljana 1001, Slovenia",
  title = "Use of artificial-intelligence techniques for the description of processes in {Ni/Al} multilayers",
  journal = "Surface \& Coatings Technology",
  year = "1996",
  volume = "84",
  number = "1-3",
  pages = "491--494",
  abstract = "Knowledge discovery is a novel research area in the field of artificial intelligence. Its aim is to discover empirical laws that govern the behavior of complex systems using measurements of system variables. In this paper a brief description of the GOLDHORN knowledge discovery system is presented. GOLDHORN discovers differential equations and has features for handling noisy data, including some digital filters.
In the present case, this method was used to describe analytically atomic migration in thin layers. A multilayer structure of nickel and aluminum was deposited on a copper substrate using the triode sputtering system and hollow cathode CVD plasma deposition. The composition of the elements in the deposited layers was determined by Auger electron spectroscopy (AES). The structure was then annealed for different times. After annealing, the samples were analyzed again. The AES data were then analyzed by the GOLDHORN software package in order to obtain an analytical description of atomic migration as a function of the relative concentration of elements in a layer. The analysis shows that the rate of migration of Al in Ni depends on the relative concentrations of the elements. Different phases appeared to be indicated via the changes in the slope of the curve. Our results show that knowledge discovery is a very useful tool for analyzing complex processes such as atomic migration in multilayer systems.",
  keywords = "INTERFACE, MULTILAYER STRUCTURES, KNOWLEDGE DISCOVERY",
}

@InProceedings{drucker:fast-committee:97,
  title = "Fast Committee Machines for Regression and Classification",
  author = "Harris Drucker",
  pages = "159",
  crossref = "heckerman.ea:proceedings-third:97",
}

@Book{duran.ea:cluster-analysis:74,
  author = "Benjamin S. Duran and Patrick L. Odell",
  title = "Cluster analysis: a survey",
  year = "1974",
  publisher = "Springer-Verlag",
  series = "Lecture Notes in Economics and Mathematical Systems",
  volume = "100",
}

@InProceedings{dzeroski.ea:discovering-dynamics:93,
  author = "S. D{\v{z}}eroski and L. Todorovski",
  title = "Discovering dynamics",
  booktitle = "Proceedings of the AAAI-93 Workshop on Knowledge Discovery in Databases",
  pages = "125--137",
  publisher = "AAAI Press",
  year = "1993",
}

@InCollection{dzeroski:inductive-logic:95,
  author = "S. D{\v{z}}eroski",
  title = "Inductive logic programming and knowledge discovery in databases",
  editor = "U. Fayyad and G.
Piatetsky-Shapiro and P. Smyth and R. Uthurusamy",
  booktitle = "Advances in Knowledge Discovery and Data Mining",
  pages = "118--152",
  year = "1995",
  publisher = "The MIT Press",
}

@InProceedings{eick.ea:interactive-at:95,
  author = "Stephen G. Eick and Brian S. Johnson",
  title = "Interactive Data Visualization at {AT}\&{T} Bell Labs",
  booktitle = "Proceedings of ACM CHI'95 Conference on Human Factors in Computing Systems",
  series = "Demonstrations: Visualization",
  volume = "2",
  pages = "17--18",
  year = "1995",
  URL = "http://www.acm.org/sigchi/chi95/proceedings/demos/bsj\_bdy.htm",
  copyright = "(c) Copyright 1995 Association for Computing Machinery",
  keywords = "Visualization, Graphic interaction, Abstract data visualization, Database visualization, Data mining",
  abstract = "Visualization is a key technology for understanding large bodies of data. Our approach to visualizing abstract, non-geometric data involves a reduced-representation overview, multiple linked views, filtering and focusing techniques to reduce visual clutter, color, and a highly-interactive user interface. The reduced representations allow users to see the entire data set in one view while still providing immediate access to relevant detail and answers to specific questions in the linked views. We have developed a software infrastructure embodying our design principles for producing novel, high-bandwidth visualizations of corporate datasets.
Our approach to abstract data visualization is one the best off-ramps on the information superhighway.",
}

@Article{eisenberg:essay-anne:96,
  author = "Anne Eisenberg",
  title = "Essay: Anne Eisenberg --- Data mining and privacy invasion on the Net",
  journal = "Scientific American",
  volume = "274",
  number = "3",
  pages = "120--??",
  month = mar,
  year = "1996",
  ISSN = "0036-8733",
}

@Article{elomaa:defence-c4:,
  author = "Tapio Elomaa",
  title = "In Defence of {C4}.5 Notes on Learning One-Level Decision Trees",
  URL = "ftp://ftp.cs.helsinki.fi/pub/Reports/by_Project/PMDM/In_Defence_of_C4.5__Notes_on_Learning_One-Level_Decision_Trees.ps.gz",
  note = "To appear in W. Cohen \& H. Hirsh (eds.), Machine Learning: Proceedings of the Eleventh International Conference.(New Brunswick NJ, July 1994.) Morgan Kaufmann, San Francisco CA.",
  abstract = "We discuss the implications of Holte's recently published article, which demonstrated that on the most commonly used data very simple classification rules are almost as accurate as decision trees produced by Quinlan's C4.5. We consider, in particular, what is the significance of Holte's results for the future of top-down induction of decision trees. To an extent, Holte questioned the sense of further research on multilevel decision tree learning. We go in detail through all the parts of Holte's study. We try to put the results into perspective. We argue that the (in absolute terms) small difference in accuracy between 1R and C4.5 that was witnessed by Holte is still significant. We claim that C4.5 possesses additional accuracy-related advantages over 1R. In addition we discuss the representativeness of the databases used by Holte. We compare empirically the optimal accuracies of multilevel and one-level decision trees and observe some significant differences.
We point out several deficiencies of limited-complexity classifiers.",
}

@Article{emde.ea:uberblick:96,
  author  = "Werner Emde and Dietrich Wettschereck and Stefan Wrobel",
  title   = "Data Mining - Ein {\"U}berblick",
  journal = "Unix/Mail",
  year    = "1996",
  note    = "to appear",
}

% The two Engels papers inherit booktitle/year from their crossref parents;
% "pages" holds the first page only.
@InProceedings{engels.ea:guided-tour:97,
  author   = "Robert Engels and Guido Lindner and Rudi Studer",
  title    = "A Guided Tour through the Data Mining Jungle",
  pages    = "163",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InProceedings{engels:planning-tasks:96,
  author   = "Robert Engels",
  title    = "Planning Tasks for Knowledge Discovery in Databases; Performing Task-Oriented User-Guidance",
  pages    = "170",
  crossref = "simoudis.ea:proceedings-second:96",
}

% German-language diploma thesis report; umlauts are transliterated (ae/oe/ue)
% in the source data.
@TechReport{escher:evaluation-und:97,
  author      = "Stefan Escher",
  title       = "Evaluation und Erweiterung eines Verfahrens zum Finden von Regelmaessigkeiten in relationalen Datenbanken",
  institution = "Universitaet Stuttgart, Fakultaet Informatik, Germany",
  number      = "DIP-1444",
  month       = jan # " 1",
  year        = "1997",
  keywords    = "ILP, Data Mining, Knowledge Discovery in Databases",
  URL         = "ftp://ftp.informatik.uni-stuttgart.de/pub/library/ncstrl.ustuttgart_fi/DIP-1444/DIP-1444.ps.gz",
  abstract    = "In den letzten Jahren wurden die Techniken zur Datenerhebung und Speicherung stark weiterentwickelt. Zum Beispiel fuehren Barcodes auf nahezu allen Produkten und die Automatisierung von Betriebsablaeufen zu immer groesseren Datenmengen, die interpretiert werden muessen. Das Problem liegt darin, dass eine grosse Menge von Information vorhanden ist, das darin enthaltene Wissen jedoch aufgrund der grossen Datenmenge nicht zugaenglich ist. Daraus ergibt sich die Notwendigkeit zur Entdeckung von Wissen in grossen Datenbanken (Knowledge Discovery in Databases, Data Mining). Grundlage des in dieser Diplomarbeit vorgestellten Verfahrens ist das angenaeherte nichtmonotone ILP (Inductive Logic Programming).
Gefunden werden Hornformeln, wobei eine Menge von Rumpfliteralen vom Benutzer angegeben werden muss. Die Qualitaet von gefundenen Klauseln wird von den Messwerten Support und Confidence bestimmt. Hauptsaechlich beschaeftigt sich die Diplomarbeit mit der Erweiterung eines bestehenden ILP-Verfahrens um eine Komponente, die numerische Attribute behandeln kann",
}

@InProceedings{esposito.ea:refinement-datalog:96,
  author    = "F. Esposito and A. Laterza and D. Malerba and G. Semeraro",
  title     = "Refinement of {Datalog} Programs",
  booktitle = "Proceedings of the MLnet Familiarization Workshop on Data Mining with Inductive Logic Programming",
  pages     = "73--94",
  year      = "1996",
}

% The two KDD papers below share the same four authors; the umlaut in
% "Joerg Sander" is written as a BibTeX special character in both.
@InProceedings{ester.ea:density-based-algorithm:96,
  author   = "Martin Ester and Hans-Peter Kriegel and J{\"o}rg Sander and Xiaowei Xu",
  title    = "A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise",
  pages    = "226",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{ester.ea:density-connected-sets:97,
  author   = "Martin Ester and Hans-Peter Kriegel and J{\"{o}}rg Sander and Xiaowei Xu",
  title    = "Density-Connected Sets and their Application for Trend Detection in Spatial Databases",
  pages    = "10",
  crossref = "heckerman.ea:proceedings-third:97",
}

% "address" holds the authors' affiliation as delivered by the source index.
@Article{ester.ea:large-spatial:95,
  author   = "M. Ester and H. P. Kriegel and X. W. Xu",
  address  = "Univ Munich, Inst Comp Sci, Leopoldstr 11B, D-80802 Munich, Germany",
  title    = "Knowledge discovery in large spatial databases - focusing techniques for efficient class identification",
  journal  = "Lecture Notes In Computer Science",
  year     = "1995",
  volume   = "951",
  pages    = "67--82",
  abstract = "Both, the number and the size of spatial databases are rapidly growing because of the large amount of data obtained from satellite images, X-ray crystallography or other scientific equipment. Therefore, automated knowledge discovery becomes more and more important in spatial databases.
So far, most of the methods for knowledge discovery in databases (KDD) have been based on relational database systems. In this paper, we address the task of class identification in spatial databases using clustering techniques. We put special emphasis on the integration of the discovery methods with the DB interface, which is crucial for the efficiency of KDD on large databases. The key to this integration is the use of a well-known spatial access method, the R*-tree. The focusing component of a KDD system determines which parts of the database are relevant for the knowledge discovery task. We present several strategies for focusing: selecting representatives from a spatial database, focusing on the relevant clusters and retrieving all objects of a given cluster. We have applied the proposed techniques to real data from a large protein database used for predicting protein-protein docking. A performance evaluation on this database indicates that clustering on large spatial databases can be performed, both, efficiently and effectively.",
  keywords = "PROTEIN",
}

% Only the first author is recorded in the source data; "and others" lets the
% bibliography style render the "et al." instead of treating it as a surname.
@Article{etal:discovering-functional:92,
  crossref = "ijis-special-issue:92",
  author   = "Martti Kantola and others",
  title    = "Discovering Functional and Inclusion Dependencies in Relational Databases",
  pages    = "591--607",
}

@Article{evans.ea:overcoming-process:94,
  author   = "Bob Evans and Doug Fisher",
  title    = "Overcoming Process Delays with Decision Tree Induction",
  journal  = "IEEE Expert",
  year     = "1994",
  pages    = "60--66",
  month    = feb,
  keywords = "Knowledge Acquisition, Decision Trees, ID3",
}

@InProceedings{evans:clarit:95,
  author = "David A.
Evans",
  title     = "{CLARIT}",
  booktitle = "Proceedings of the Eighteenth Annual International ACM SIGIR Conference on Research and Development in Information Retrieval",
  series    = "Systems Demonstrations: Abstracts",
  pages     = "360",
  year      = "1995",
  copyright = "(c) Copyright 1995 Association for Computing Machinery",
  abstract  = "The CLARIT system consists of a set of flexible tools for application in a wide range of information management problems. These tools integrate natural-language processing (NLP), automatic knowledge discovery, and traditional information retrieval techniques. An advanced functionality application for free-text database management is demonstrated, incorporating full NLP, a broad range of querying mechanisms, automatic or user controlled query expansion, document collection profiling, document summarization, automatic document classification, and integrated handling of scanned images. The application provides rapid analysis of potentially large queries over large-scale databases in monolithic or client/server processing modes.",
}

@Article{ezawa.ea:constructing-bayesian:96,
  author = "K. J. Ezawa and S. W.
Norton",
  address  = "At\&T Bell Labs, Consumer Lab, Tech Staff, 600 Mt Ave, Rm 7E-523, Murray Hill, Nj, 07974",
  title    = "Constructing bayesian networks to predict uncollectible telecommunications accounts",
  journal  = "IEEE Expert-Intelligent Systems \& Their Applications",
  year     = "1996",
  volume   = "11",
  number   = "5",
  pages    = "45--51",
  keywords = "EXPERT-SYSTEMS",
}

% Underscores in the abstract are escaped so the text stays valid LaTeX if a
% style prints abstracts.
@TechReport{fahner:interaction-selection:96,
  author      = "Gerald Fahner",
  title       = "Interaction Selection and Complexity Control for Learning in Binarized Domains",
  institution = "International Computer Science Institute",
  number      = "TR-96-001",
  address     = "Berkeley, CA",
  month       = may,
  year        = "1996",
  keywords    = "learning algorithms, feature selection, Walsh-functions, input-space representation, complexity measures, capacity control, model comparison",
  abstract    = "We empirically investigate the potential of a novel, greatly simplified classifier design for binarized data. The generic model allocates a sparse, \_digital\_ hidden layer comprised of interaction nodes that compute PARITY of selected submasks of input bits, followed by a sigmoidal output node with adjustable weights. Model identification incorporates user-assigned complexity preferences. We discuss the situations: a) when the input space obeys a metrics b) when the inputs are discrete attributes We propose a family of respective model priors that make search through the combinatorial space of multi-input interactions feasible. Model capacity and smoothness of the approximation are controlled by two complexity parameters. Model comparison over the parameter plane discovers models with excellent performance. In some cases interpretable structures are achieved.
We point out the significance of our novel data mining tool for overcoming scaling problems, impacts on real-time systems, and possible contributions to the development of non-standard computing devices for inductive inference.",
}

@InProceedings{fahner:with-sparse:96,
  author   = "Gerald Fahner",
  title    = "Data Mining with Sparse and Simplified Interaction Selection",
  pages    = "359",
  crossref = "simoudis.ea:proceedings-second:96",
}

% NOTE(review): the 95a and 95b records carry the same title, authors, year
% and page range (163--174); they look like the proceedings and SIGMOD Record
% views of one paper -- confirm before citing both.
@InProceedings{faloutsos.ea:fastmap-fast:95a,
  key_modifier = "a",
  author       = "Christos Faloutsos and King-Ip Lin",
  title        = "{FastMap}: {A} Fast Algorithm for Indexing, Data-Mining and Visualization of Traditional and Multimedia Datasets",
  editor       = "Michael J. Carey and Donovan A. Schneider",
  booktitle    = "Proceedings of the 1995 {ACM} {SIGMOD} International Conference on Management of Data",
  address      = "San Jose, California",
  month        = "22--25~" # may,
  year         = "1995",
  pages        = "163--174",
}

@Article{faloutsos.ea:fastmap-fast:95b,
  key_modifier   = "b",
  author         = "C. Faloutsos and King-Ip Lin",
  title          = "{FastMap}: a fast algorithm for indexing, data-mining and visualization of traditional and multimedia datasets",
  journal        = "SIGMOD Record (ACM Special Interest Group on Management of Data)",
  volume         = "24",
  number         = "2",
  pages          = "163--174",
  month          = jun,
  year           = "1995",
  ISSN           = "0163-5808",
  affiliation    = "AT\&T Bell Labs., Murray Hill, NJ, USA",
  classification = "C6160 (Database management systems (DBMS)); C6170K (Knowledge engineering techniques); C7240 (Information analysis and indexing); C6160S (Spatial and pictorial databases); C4240 (Programming and algorithm theory)",
  keywords       = "FastMap; Fast algorithm; Indexing; Data-mining; Visualization; Multimedia datasets; Traditional datasets; Feature-extraction functions; Domain expert; Highly fine-tuned spatial access methods; Best-match query; K-dimensional space; Potential clusters; Pattern recognition",
  thesaurus      = "Feature extraction; Indexing; Knowledge acquisition; Multimedia computing; Pattern
matching",
}

% Technical-report version of the FastMap paper. The abstract came from a
% scanned source; obvious character-recognition damage has been repaired.
@TechReport{faloutsos.ea:fastmap-fast:95c,
  key_modifier = "c",
  author       = "Christos Faloutsos and King-Ip (David) Lin",
  title        = "{FastMap}: {A} Fast Algorithm for Indexing, Data-Mining and Visualization of Traditional and Multimedia Datasets",
  institution  = "University of Maryland Institute for Advanced Computer Studies Dept. of Computer Science, Univ. of Maryland",
  number       = "CS-TR-3383",
  address      = "College Park, MD",
  month        = jan,
  year         = "1995",
  URL          = "ftp://ftp.cs.umd.edu/pub/papers/papers/3383/3383.ps.Z",
  abstract     = "A very promising idea for fast searching in traditional and multimedia databases is to map objects into points in k-d space, using k feature-extraction functions, provided by a domain expert [Jag91]. Thus, we can subsequently use highly fine-tuned spatial access methods (SAMs), to answer several types of queries, including the 'Query By Example' type (which translates to a range query); the 'all pairs' query (which translates to a spatial join [BKSS94]); the nearest-neighbor or best-match query, etc. \par However, designing feature extraction functions can be hard. It is relatively easier for a domain expert to assess the similarity/distance of two objects. Given only the distance information though, it is not obvious how to map objects into points. \par This is exactly the topic of this paper. We describe a fast algorithm to map objects into points in some k-dimensional space (k is user-defined), such that the dissimilarities are preserved. There are two benefits from this mapping: (a) efficient retrieval, in conjunction with a SAM, as discussed before and (b) visualization and data-mining: the objects can now be plotted as points in 2-d or 3-d space, revealing potential clusters, correlations among attributes and other regularities that data-mining is looking for.
\par We introduce an older method from pattern recognition, namely, Multi-Dimensional Scaling (MDS) [Tor52]; although unsuitable for indexing, we use it as yardstick for our method. Then, we propose a much faster algorithm to solve the problem in hand, while in addition it allows for indexing. Experiments on real and synthetic data indeed show that the proposed algorithm is significantly faster than MDS, (being linear, as opposed to quadratic, on the database size N), while it manages to preserve distances and the overall structure of the data-set. \par (Also cross-referenced as UMIACS-TR-94-132)",
}

% "pages" (not "page") is the field name BibTeX styles read; unknown field
% names are silently ignored.
@InProceedings{fawcett.ea:combining-machine:96,
  author   = "Tom Fawcett and Foster Provost",
  title    = "Combining Data Mining and Machine Learning for Effective User Profiling",
  pages    = "8",
  crossref = "simoudis.ea:proceedings-second:96",
}

% Publisher and city spelled as in the other AAAI Press / MIT Press records
% of this file.
@Book{fayyad.ea:advances:96,
  editor     = "U. M. Fayyad and G. Piatetsky-Shapiro and P. Smyth and R. Uthurusamy",
  title      = "Advances in Knowledge Discovery and Data Mining",
  publisher  = "AAAI Press / MIT Press",
  address    = "Menlo Park",
  year       = "1996",
  ISBN       = "0-262-56097-6",
  descriptor = "Data Mining, Daten",
}

@InProceedings{fayyad.ea:attribute-selection:92,
  author    = "U. M. Fayyad and K. B. Irani",
  title     = "The Attribute Selection Problem in Decision Tree Generation",
  booktitle = "Proc.\ of AAAI-92",
  pages     = "104--110",
  year      = "1992",
}

@InProceedings{fayyad.ea:automated-cataloging:93,
  author    = "Usama M. Fayyad and Nicholas Weir and S. Djorgovski",
  title     = "Automated cataloging and analysis of sky survey image databases: the {SKICAT} system",
  booktitle = "Proc. of the second Int. Conf.
on Information and Knowledge Management",
  address = "Washington DC",
  pages   = "527--536",
  year    = "1993",
}

% In the two records below "address" holds author affiliations as delivered
% by the source index, not a publisher location.
@Article{fayyad.ea:data-mining:96,
  author  = "Usama Fayyad and Ramasamy Uthurusamy",
  address = "Microsoft Corp, Res, Redmond, Wa, 98052 Gm Corp, Knowledge \& Decis Support, Detroit, Mi, 48202",
  title   = "Data Mining and Knowledge Discovery in Databases",
  journal = "Communications of the ACM",
  volume  = "39",
  number  = "11",
  pages   = "24--27",
  month   = nov,
  year    = "1996",
  ISSN    = "0001-0782",
}

@Article{fayyad.ea:digitized-images:96,
  author   = "U. M. Fayyad and S. G. Djorgovski and N. Weir",
  address  = "Microsoft Res, Redmond, Ca Caltech, Jpl, Machine Learning Syst Grp, Pasadena, Ca, 91125",
  title    = "From digitized images to online catalogs - data mining a sky survey",
  journal  = "AI Magazine",
  year     = "1996",
  volume   = "17",
  number   = "2",
  pages    = "51--66",
  abstract = "The value of scientific digital-image libraries seldom lies in the pixels of images. For large collections of images, such as those resulting from astronomy sky surveys, the typical useful product is an online database cataloging entries of interest. We focus on the automation of the cataloging effort of a major sky survey and the availability of digital libraries in general. The SKICAT system automates the reduction and analysis of the three terabytes worth of images, expected to contain on the order of 2 billion sky objects. For the primary scientific analysis of these data, it is necessary to detect, measure, and classify every sky object. SKICAT integrates techniques for image processing, classification learning, database management, and visualization. The learning algorithms are trained to classify the detected objects and can classify objects too faint for visual classification with an accuracy level exceeding 90 percent. This accuracy level increases the number of classified objects in the final catalog threefold relative to the best results from digitized photographic sky surveys to date.
Hence, learning algorithms played a powerful and enabling role and solved a difficult, scientifically significant problem, enabling the consistent, accurate classification and the ease of access and analysis of an otherwise unfathomable data set.",
}

@Article{fayyad.ea:scientific:96,
  author  = "Usama Fayyad and David Haussler and Paul Stolorz",
  title   = "Mining Scientific Data",
  journal = "Communications of the ACM",
  volume  = "39",
  number  = "11",
  pages   = "51--57",
  month   = nov,
  year    = "1996",
  ISSN    = "0001-0782",
}

% "address" holds author affiliations as delivered by the source index.
@Article{fayyad.ea:to:96,
  author   = "U. Fayyad and G. Piatetsky-Shapiro and P. Smyth",
  address  = "Univ Calif Irvine, Dept Comp \& Informat Sci, Irvine, Ca, 92717 Gte Labs Inc, Knowledge Discovery Databases Kdd Project, Tech Staff, Waltham, Ma, 02254",
  title    = "From data mining to knowledge discovery in databases",
  journal  = "AI Magazine",
  year     = "1996",
  volume   = "17",
  number   = "3",
  pages    = "37--54",
  abstract = "Data mining and knowledge discovery in databases have been attracting a significant amount of research, industry, and media attention of late. What is all the excitement about? This article provides an overview of this emerging field, clarifying how data mining and knowledge discovery in databases are related both to each other and to related fields, such as machine learning, statistics, and databases. The article mentions particular real-world applications, specific data-mining techniques, challenges involved in real-world applications of knowledge discovery, and current and future research directions in the field.",
  keywords = "NEURAL NETWORKS",
}

@InProceedings{fayyad.ea:towards-unifying:96,
  author   = "Usama Fayyad and Gregory Piatetsky-Shapiro and Padhraic Smyth",
  title    = "Knowledge Discovery and Data Mining: Towards a Unifying Framework",
  pages    = "82",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{fayyad:applications-astronomy:96a,
  key_modifier = "a",
  author = "Usama M.
Fayyad",
  title     = "Data Mining and Knowledge Discovery in Databases: Applications in Astronomy and Planetary Science (Invited Talk)",
  pages     = "1590--1592",
  booktitle = "Proceedings of the Thirteenth National Conference on Artificial Intelligence and the Eighth Innovative Applications of Artificial Intelligence Conference",
  month     = "4--8~" # aug,
  publisher = "AAAI Press / MIT Press",
  address   = "Menlo Park",
  year      = "1996",
}

% "address" holds the author's affiliation as delivered by the source index.
@Article{fayyad:making-sense:96b,
  key_modifier = "b",
  author       = "U. M. Fayyad",
  address      = "Microsoft Res, 1 Microsoft Way 9-S, Redmond, Wa, 98052",
  title        = "Data mining and knowledge discovery - making sense out of data",
  journal      = "IEEE Expert-Intelligent Systems \& Their Applications",
  year         = "1996",
  volume       = "11",
  number       = "5",
  pages        = "20--25",
}

@InProceedings{feelders:learning-biased:96,
  author   = "A. J. Feelders",
  title    = "Learning from Biased Data Using Mixture Models",
  pages    = "102",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{feldman.ea:associations-text:96,
  author   = "Ronen Feldman and Haym Hirsh",
  title    = "Mining Associations in Text in the Presence of Background Knowledge",
  pages    = "343",
  crossref = "simoudis.ea:proceedings-second:96",
}

% Five authors: every pair of names must be separated by " and ", otherwise
% BibTeX parses the last two as one person.
@InProceedings{feldman.ea:maximal-association:97,
  author   = "Ronen Feldman and Yonatan Aumann and Amihood Amir and Willi Kl{\"{o}}sgen and Amir Zilberstein",
  title    = "Maximal Association Rules: {A} New Tool for Mining for Keyword Co-Occurrences in Document Collections",
  pages    = "167",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InProceedings{feldman.ea:techniques-to:97,
  author   = "Ronen Feldman and Willi Kl{\"{o}}sgen and Amir Zilberstein",
  title    = "Visualization Techniques to Explore Data Mining Results for Document Collections",
  pages    = "16",
  crossref = "heckerman.ea:proceedings-third:97",
}

% Only the first author is recorded; "and others" is the BibTeX token for
% "et al.".
@Article{fisher.ea:applying-ai:93,
  author  = "Doug Fisher and others",
  title   = "Applying {AI} Clustering to Engineering Tasks",
  journal = "IEEE Expert",
  year    = "1993",
  pages   = "51--60",
  month   = dec,
keywords = "COBWEB, Clustering, Applications, Time Series",
  annote   = "Application of COBWEB to fault diagnosis, Bridge design and human gait analysis. 17 references",
}

% NOTE(review): institution and year are inferred from the URL host
% (vanderbilt.edu) and the report number CS-95-01 -- confirm against the
% report itself.
@TechReport{fisher:iterative-optimization:,
  author      = "Doug Fisher",
  title       = "Iterative Optimization and Simplification of Hierarchical Clusterings",
  institution = "Department of Computer Science, Vanderbilt University",
  type        = "Technical Report",
  number      = "CS-95-01",
  year        = "1995",
  URL         = "http://cswww.vuse.vanderbilt.edu/~dfisher/tech-reports/tr-95-01.html",
  abstract    = "Clustering is often used for discovering structure in data. Clustering systems differ in the objective function used to evaluate clustering quality and the control strategy used to search the space of clusterings. Ideally, the search strategy should consistently construct clusterings of high quality, but be computationally inexpensive as well. In general, we cannot have it both ways, but we can partition the search so that a system inexpensively constructs a `tentative' clustering for initial examination, followed by iterative optimization, which continues to search in background for improved clusterings. Given this motivation, we evaluate an inexpensive `sorting' strategy coupled with several control strategies for iterative optimization, each of which repeatedly modifies an initial clustering in search of a better one. One of these optimization strategies, inspired by work on macro-operator learning, appears to be novel in the clustering literature. Once a clustering has been constructed it is judged by analysts -- often according to task-specific criteria. Several authors have abstracted these criteria and posited a generic performance task akin to pattern completion, where the error rate over completed patterns is used to `externally' judge clustering utility. Given this performance task we adapt resampling-based pruning strategies used by supervised learning systems to the task of simplifying hierarchical clusterings, thus promising to ease post-clustering analysis.
Finally, we propose a number of objective functions, based on attribute-selection measures for decision-tree induction, that might perform well on the error rate and simplicity dimensions.",
  keywords = "clustering, iterative optimization, cluster validation, resampling, pruning, objective functions",
}

% Journal version (JAIR volume 4) of the paper with the same title.
@Article{fisher:iterative-optimization:96,
  author   = "Doug Fisher",
  title    = "Iterative Optimization and Simplification of Hierarchical Clusterings",
  journal  = "Journal of Artificial Intelligence Research",
  volume   = "4",
  pages    = "147--180",
  year     = "1996",
  URL      = "http://cswww.vuse.vanderbilt.edu/~dfisher/jair-96/html-final/html-final.html",
  abstract = " Clustering is often used for discovering structure in data. Clustering systems differ in the objective function used to evaluate clustering quality and the control strategy used to search the space of clusterings. Ideally, the search strategy should consistently construct clusterings of high quality, but be computationally inexpensive as well. In general, we cannot have it both ways, but we can partition the search so that a system inexpensively constructs a `tentative' clustering for initial examination, followed by iterative optimization, which continues to search in background for improved clusterings. Given this motivation, we evaluate an inexpensive strategy for creating initial clusterings, coupled with several control strategies for iterative optimization, each of which repeatedly modifies an initial clustering in search of a better one. One of these methods appears novel as an iterative optimization strategy in clustering contexts. Once a clustering has been constructed it is judged by analysts -- often according to task-specific criteria. Several authors have abstracted these criteria and posited a generic performance task akin to pattern completion, where the error rate over completed patterns is used to `externally' judge clustering utility.
Given this performance task, we adapt resampling-based pruning strategies used by supervised learning systems to the task of simplifying hierarchical clusterings, thus promising to ease post-clustering analysis. Finally, we propose a number of objective functions, based on attribute-selection measures for decision-tree induction, that might perform well on the error rate and simplicity dimensions.",
  keywords = "clustering, iterative optimization, cluster validation, resampling, pruning, objective functions",
}

% Proceedings contribution: the venue, editors, pages and publisher that were
% packed into a free-text note are stored in their own fields.
@InProceedings{flach:inductive-characterisation:90,
  author    = "P. A. Flach",
  title     = "Inductive characterisation of database relations",
  editor    = "Z. W. Ras and M. Zemankowa and M. L. Emrich",
  booktitle = "Proceedings of the International Symposium on Methodologies for Intelligent Systems",
  pages     = "371--378",
  publisher = "North-Holland",
  address   = "Amsterdam",
  year      = "1990",
  note      = "ITK Research Report No. 23",
  URL       = "ftp://ftp.gmd.de/MachineLearning/ILP/public/papers/flach-ITKreport23.ps.Z",
}

@Article{flanagan:10-hottest:96,
  author         = "Patrick Flanagan",
  title          = "10 hottest technologies in telecom",
  journal        = "Telecommunications (Americas Edition)",
  journalabr     = "Telecommunications Am Ed",
  volume         = "30",
  number         = "5",
  pages          = "6",
  month          = may,
  year           = "1996",
  ISSN           = "0278-4831",
  classification = "716.1; 722.3; 723.1.1; 901",
  keywords       = "Asynchronous transfer mode; Automated network management; Cable modems; Computer networks; Computer programming languages; Data mining; Electric relays; Internet appliances; Intranet; Java programming language; Local area networks; Modems; Personal communication systems; Personal satellite phones; Technology; Telecommunication; Telecommunication systems; Telecommunication technology; Voice over frame relay; Voice/data communication systems",
}

@InProceedings{flockhart.ea:genetic-algorithm-based:96,
  title  = "A Genetic Algorithm-Based Approach to Data Mining",
  pages  = "299",
  author = "Ian W. Flockhart and Nicholas J.
Radcliffe",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InCollection{forsyth:inductive-learning:89,
  author    = "Richard Forsyth",
  title     = "Inductive Learning for Expert Systems",
  booktitle = "Expert Systems Principles and Case Studies",
  publisher = "Chapman and Hall",
  address   = "New York",
  year      = "1989",
}

% Introductory chapter of the edited book "Knowledge discovery in databases",
% hence a chapter-in-collection record rather than a conference paper.
@InCollection{frawley.ea:overview:91,
  author    = "W. J. Frawley and G. Piatetsky-Shapiro and C. J. Matheus",
  title     = "Knowledge discovery in databases: an overview",
  editor    = "G. Piatetsky-Shapiro and W. J. Frawley",
  booktitle = "Knowledge discovery in databases",
  pages     = "1--27",
  publisher = "AAAI Press/MIT Press",
  address   = "Menlo Park, CA/Cambridge, MA",
  year      = "1991",
}

% NOTE(review): the 92a and 92b records describe the same AI Magazine article
% but disagree on the page range (see the annote) -- verify before citing.
@Article{frawley.ea:overview:92a,
  key_modifier = "a",
  author       = "W. Frawley and G. Piatetsky-Shapiro and C. Matheus",
  title        = "Knowledge Discovery in Databases: An Overview.",
  journal      = "AI Magazine",
  year         = "1992",
  pages        = "213--228",
  month        = "Fall (Autumn)",
  abstract     = "After a decade of fundamental interdisciplinary research in machine learning, the spadework in this field has been done; the 1990s should see the widespread exploitation of knowledge discovery as an aid to assembling knowledge bases. The contributors to the AAAI Press book Knowledge Discovery in Databases were excited at the potential benefits of this research. The editors hope that some of this excitement will communicate itself to AI Magazine readers of this article.",
  note         = "Reprint of the introductory chapter of {\em Knowledge Discovery in Databases} collection, AAAI/MIT Press, 1991.",
  annote       = "Conflicting page numbers in another bibtex entry!",
}

@Article{frawley.ea:overview:92b,
  key_modifier = "b",
  author = "W. J. Frawley and G. Piatetsky-Shapiro and C. J.
Matheus",
  address  = "Gte Labs Inc, Distributed Cooperating Learning Syst Project, Waltham, Ma, 02254 Gte Labs Inc, Knowledge Discovery Databases Project, Waltham, Ma, 02254",
  title    = "Knowledge discovery in databases - an overview",
  journal  = "AI Magazine",
  year     = "1992",
  volume   = "13",
  number   = "3",
  pages    = "57--70",
  abstract = "After a decade of fundamental interdisciplinary research in machine learning, the spadework in this field has been done; the 1990s should see the widespread exploitation of knowledge discovery as an aid to assembling knowledge bases. The contributors to the AAAI Press book Knowledge Discovery in Databases were excited at the potential benefits of this research. The editors hope that some of this excitement will communicate itself to AI Magazine readers of this article.",
  annote   = "Conflicting page numbers in another bibtex entry!",
}

% Chapter of an edited book (booktitle/editor/publisher are given and there is
% no journal), so it is a chapter-in-collection record.
@InCollection{frawley:using-function:91,
  crossref  = "piatetsky-shapiro.ea:knowledge-discovery:91",
  author    = "W. Frawley",
  title     = "Using function to encode domain and contextual knowledge in statistical induction",
  editor    = "Gregory Piatetsky-Shapiro and William J. Frawley",
  booktitle = "Knowledge Discovery in Databases",
  publisher = "AAAI Press / The MIT Press",
  address   = "Menlo Park, California",
  edition   = "1st",
  year      = "1991",
  annote    = "Details of the FBI system for decision tree induction",
}

% Author initials follow the two other Freitas/Lavington records in this file
% (A. A. Freitas, S. H. Lavington).
@TechReport{freitas.ea:data-parallel-primitive:95,
  author      = "A. A. Freitas and S. H. Lavington",
  title       = "A data-parallel primitive for high-performance knowledge discovery in large databases",
  institution = "University of Essex, UK",
  type        = "Internal Report",
  number      = "CSM-242",
  month       = may,
  year        = "1995",
  URL         = "ftp://ftp.essex.ac.uk/pub/csc/technical-reports/CSM-242.ps.Z",
  abstract    = "Efficiency is crucial in KDD (Knowledge Discovery in Databases), due to the huge amount of data stored in current databases.
We argue that high efficiency in KDD can be achieved by combining two approaches, namely encapsulating KDD functionally within standard DBMS operations and using parallel processing. Hence, KDD tasks can be executed on a back-end SQL server, e.g. a parallel DB machine. We propose a KDD primitive (a set of basic operations) which underlies the candidate-rule evaluation procedures of many KDD algorithms. We compare and analyse the time required to carry out this primitive on three different computational architecture, viz. a conventional workstation and two parallel DB machines. The main advantages of encapsulating a KDD primitive in a parallel DB server are automatic parallelization and the run-time speed which can be achieved through parallel processing.",
}

% End pages of the two LNCS records are unknown in the source data ("??").
@Article{freitas.ea:parallel-very:96,
  author  = "A. A. Freitas and S. H. Lavington",
  title   = "Parallel Data Mining for Very Large Relational Databases",
  journal = "Lecture Notes in Computer Science",
  volume  = "1067",
  pages   = "158--??",
  year    = "1996",
  ISSN    = "0302-9743",
}

@Article{freitas.ea:speeding-up:96,
  author  = "A. A. Freitas and S. H. Lavington",
  title   = "Speeding up Knowledge Discovery in Large Relational Databases by Means of a New Discretization Algorithm",
  journal = "Lecture Notes in Computer Science",
  volume  = "1094",
  pages   = "124--??",
  year    = "1996",
  ISSN    = "0302-9743",
}

@InProceedings{friedman.ea:lazy-decision:96,
  author        = "Jerome Friedman and Ron Kohavi and Yeogirl Yun",
  title         = "Lazy Decision Trees",
  booktitle     = "Proceedings of the Thirteenth National Conference on Artificial Intelligence",
  publisher     = "AAAI Press and the MIT Press",
  pages         = "717--724",
  month         = aug,
  year          = "1996",
  URL           = "http://robotics.stanford.edu/users/ronnyk",
  contributedby = "Ronny Kohavi, ronnyk(at)sgi.com",
}

@InProceedings{fukuda.ea:optimized-association:96,
  author = "T. Fukuda and Y. Morimoto and S. Morishita and T.
Tokuyama",
  title     = "Mining Optimized Association Rules for Numeric Attributes",
  editor    = "{ACM}",
  booktitle = "Proceedings of the Fifteenth {ACM} {SIGACT}-{SIGMOD}-{SIGART} Symposium on Principles of Database Systems, {PODS} 1996, Montr{\'e}al, Canada, June 3--5, 1996",
  volume    = "15",
  publisher = "ACM Press",
  address   = "New York, NY 10036, USA",
  year      = "1996",
  series    = "Proceedings of the ACM SIGACT SIGMOD SIGART Symposium on Principles of Database Systems",
  pages     = "182--191",
  annote    = "Held in conjunction with the 1996 ACM SIGMOD international conference on management of data. Also known as PODS 1996",
  keywords  = "database systems; PODS; ACM; SIGMOD; SIGART; SIGACT",
}

@InProceedings{fukuda.ea:using-two-dimensional:96,
  author    = "Takeshi Fukuda and Yasuhiko Morimoto and Shinichi Morishita and Takeshi Tokuyama",
  title     = "Data Mining using Two-dimensional Optimized Association Rules: Scheme, Algorithms, and Visualization",
  editor    = "H. V. Jagadish and Inderpal Singh Mumick",
  booktitle = "Proceedings of the 1996 {ACM} {SIGMOD} International Conference on Management of Data",
  address   = "Montreal, Quebec, Canada",
  month     = "4--6~" # jun,
  year      = "1996",
  pages     = "13--23",
}

@InProceedings{fulton.ea:local-induction:96,
  author   = "Truxton Fulton and Steven Salzberg and Simon Kasif and David Waltz",
  title    = "Local Induction of Decision Trees: Towards Interactive Data Mining",
  pages    = "14",
  crossref = "simoudis.ea:proceedings-second:96",
}

% First author's surname carries the umlaut, as in the other record by the
% same author in this file.
@Article{furnkranz.ea:international-conflict:97,
  author = "J. F{\"u}rnkranz and J. Petrak and R.
Trappl",
  address = "Austrian Res Inst Artificial Intelligence, Schottengasse 3, a-1010 Vienna, Austria Austrian Res Inst Artificial Intelligence, a-1010 Vienna, Austria",
  title = "Knowledge discovery in international conflict databases",
  journal = "Applied Artificial Intelligence",
  year = "1997",
  volume = "11",
  number = "2",
  pages = "91--118",
  abstract = "Artificial intelligence (AI) is heavily supported by military institutions, while practically no effort goes into the investigation of possible contributions of AI to the avoidance and termination of crises and wars. This article takes a first step in this direction by investigating the use of machine learning techniques for discovering knowledge in international conflict and conflict management databases. We have applied similarity-based case retrieval to the KOSIMO database of international conflicts. Furthermore, we present results of analyzing the CONFMAN database of successful and unsuccessful conflict management attempts with an inductive decision tree learning algorithm. The latter approach seems to be particularly promising, as conflict management events apparently are more repetitive and thus better suited for machine-aided analysis.",
  keywords = "MEDIATION",
}

@InProceedings{furnkranz:comparison-pruning:94,
  author = "J. F{\"{u}}rnkranz",
  title = "A Comparison of Pruning Methods for Relational Concept Learning",
  booktitle = "Proceedings of the AAAI-94 Workshop on Knowledge Discovery in Databases",
  year = "1994",
}

@Article{gaines.ea:induction-meta-knowledge:93,
  author = "B. R. Gaines and P. Compton",
  address = "Univ Calgary, Inst Knowledge Sci, Calgary T2N 1N4, Ab, Canada Univ New S Wales, Dept Comp Sci, Sydney, Nsw 2033, Australia",
  title = "Induction of meta-knowledge about knowledge discovery",
  journal = "IEEE Transactions on Knowledge and Data Engineering",
  year = "1993",
  volume = "5",
  number = "6",
  pages = "990--992",
  abstract = "A study is reported of the use of ripple-down rule induction to develop a meta-model of ten years of clinical data captured as part of the development of an expert system for thyroid diagnosis. The study shows how the suitability for inductive knowledge discovery of such real-world data can be characterized in terms of its stationarity, and how the best error rates achievable and the amount of data necessary to achieve them, can be estimated.",
  keywords = "GARVAN THYROID DATABASE, INDUCT, INDUCTION, KNOWLEDGE DISCOVERY, MACHINE LEARNING, MEDICAL DIAGNOSIS, METAMODELING, META-KNOWLEDGE, RIPPLE-DOWN RULES, RULES WITH EXCEPTIONS",
}

@InProceedings{galal.ea:improving-scalability:97,
  title = "Improving Scalability in a Scientific Discovery System by Exploiting Parallelism",
  author = "Gehad Galal and Diane J. Cook and Lawrence B. Holder",
  pages = "171",
  crossref = "heckerman.ea:proceedings-third:97",
}

% CACM volume 31 is the 1988 volume (volume 39 is 1996 elsewhere in this file).
% NOTE(review): page range not re-checked against the publisher record.
@Article{gallant:connectionist-expert:88,
  author = "Stephen I. Gallant",
  title = "Connectionist Expert Systems",
  journal = "Communications of the ACM",
  year = "1988",
  volume = "31",
  number = "2",
  pages = "153--168",
}

@InProceedings{ganesh.ea:entity-identification-rules:96,
  title = "Mining Entity-Identification Rules for Database Integration",
  pages = "291",
  author = "M. Ganesh and Jaideep Srivastava and Travis Richardson",
  crossref = "simoudis.ea:proceedings-second:96",
}

@Article{gebhardt:discovering-interesting:94,
  author = "F.
Gebhardt",
  address = "Gesell Math \& Datenverarbeitung Gmbh, Schloss Birlinghoven, Postfach 1316, D-53731 St Augustin, Germany",
  title = "Discovering interesting statements from a database",
  journal = "Applied Stochastic Models and Data Analysis",
  year = "1994",
  volume = "10",
  number = "1",
  pages = "1--14",
  abstract = "Knowledge discovery aims at extracting new knowledge from potentially large databases; this may be in the form of interesting statements about the data. Two interrelated classes of problem arise that are treated here: to put the subjective notion of 'interesting' into concrete terms and to deal with large numbers of statements that are related to one another (one rendering the other redundant or at least less interesting). Four increasingly subjective facets of 'interestingness' are identified: the subject field under consideration, the conspicuousness of a finding, its novelty, and its deviation from prior knowledge. A procedure is proposed, and tried out on two quite different data sets, that allows for specifying interestingness by various means and that ranks the results in a way that takes interestingness (relevance, evidence) as well as mutual relatedness (similarity, affinity) into account- manifestations of the second and third facets of interestingness in the given data environment.",
  keywords = "PROJECTION PURSUIT, KNOWLEDGE DISCOVERY IN DATABASES, EXPLORATORY DATA ANALYSIS, INTERESTINGNESS, PROJECT EXPLORA",
}

@Article{gerber:excavate-your:96,
  author = "Cheryl Gerber",
  title = "Excavate Your Data",
  journal = "Datamation",
  year = "1996",
  volume = "42",
  number = "9",
  month = may,
  abstract = "Datamining could be your No. 1 strategic weapon--and source of profit--in dissecting archival information. But with its roots in machine learning, this esoteric technology takes some time to master.",
  URL = "http://www.datamation.com/PlugIn/issues/1996/may1/05asoft3frame.html",
}

@Article{giordana.ea:enigma-system:93,
  author = "A. Giordana and L.
Saitta and F. Bergadano and F. Brancadori and D. De Marchi",
  title = "{ENIGMA}: {A} System that Learns Diagnostic Knowledge",
  journal = "IEEE Transactions on Knowledge and Data Engineering",
  pages = "15--28",
  volume = "5",
  number = "1",
  month = feb,
  year = "1993",
}

@Article{glymour.ea:statistical-inference:96,
  author = "Clark Glymour and David Madigan and Daryl Pregibon and Padhraic Smyth",
  address = "Carnegie Mellon Univ, Pittsburgh, Pa, 15213 Univ Calif San Diego, La Jolla, Ca, 92093 Washington Univ, Seattle, Wa At\&T Bell Labs, Murray Hill, Nj, 07974",
  title = "Statistical Inference and Data Mining",
  journal = "Communications of the ACM",
  volume = "39",
  number = "11",
  pages = "35--41",
  month = nov,
  year = "1996",
  ISSN = "0001-0782",
}

@Article{goh.ea:deductive-with:96,
  author = "C. L. Goh and M. Tsukamoto and S. Nishio",
  address = "Osaka Univ, Fac Engn, Dept Informat Syst Engn, 2-1 Yamadaoka, Suita, Osaka 565, Japan",
  title = "Knowledge discovery in deductive databases with large deduction results: the first step",
  journal = "IEEE Transactions on Knowledge and Data Engineering",
  year = "1996",
  month = dec,
  volume = "8",
  number = "6",
  pages = "952--956",
  abstract = "Deductive databases have the ability to deduce new facts from a set of facts using a set of rules. They are also useful in the integration of artificial intelligence and database. However, when recursive rules are involved, the amount of deduced facts can become too large to be practically stored, viewed or analyzed. This seriously hinders the usefulness of deductive databases. In order to overcome this problem, we propose four methods to discover characteristic rules from large amount of deduction results without actually having to store all the deduction results.
This paper presents the first step in the application of knowledge discovery techniques to deductive databases with large deduction results.",
  keywords = "attribute-oriented algorithm, characteristic rule, data mining, deductive database, recursive rule",
}

@InProceedings{greene.ea:cogin-symbolic:92,
  author = "D. P. Greene and S. F. Smith",
  title = "{COGIN}: Symbolic Induction with Genetic Algorithms",
  year = "1992",
  booktitle = "Proc.\ of AAAI-92",
  pages = "111--116",
  keywords = "GA",
}

@Article{grinstein.ea:visualization-for:92,
  crossref = "ijis-special-issue:92",
  author = "G. Grinstein and J. C. Sieg and S. Smith and M. G. Williams",
  address = "Univ Massachusetts, Lowell, Ma, 01854",
  title = "Visualization for knowledge discovery",
  journal = "International Journal of Intelligent Systems",
  year = "1992",
  volume = "7",
  number = "7",
  pages = "637--648",
  abstract = "Although the fields of data visualization and automated knowledge discovery (AKD) share many goals, workers in each field have been reluctant to adopt the tools and methods of the other field. Many AKD researchers discourage the use of visualization tools because they believe that dependence on human steering will impede the development of numerical or analytical descriptions of complex data. Many visualization researchers are concerned that their present platforms are being pushed to the limits of their performance by the most advanced visualization techniques and are therefore unwilling to incur the perceived overhead of having a database system mediate access to the data. We argue that these attitudes are somewhat short-sighted and that the techniques of these two communities are complementary. We discuss a specific visualization system that we have developed and describe the obstacles that must be overcome in integrating it into an AKD system.",
  annote = "Deals with Exvis system, which produces textures of icons, each icon representing up to 15 variables + X,Y co-ords.
Also use of sound textures.",
}

@InProceedings{grinstein:harnessing-human:96,
  title = "Harnessing the Human in Knowledge Discovery",
  pages = "384",
  author = "Georges G. Grinstein",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{grossman.ea:tree-based-optimization:96,
  title = "Data Mining and Tree-Based Optimization",
  pages = "323",
  author = "Robert Grossman and Haim Bodek and Dave Northcutt and Vince Poor",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{grossman:terabyte-challenge:96,
  author = "Robert Grossman",
  title = "The Terabyte Challenge: An Open, Distributed Testbed for Managing and Mining Massive Data Sets",
  booktitle = "CD-ROM Proceedings of Supercomputing'96",
  publisher = "IEEE",
  address = "Pittsburgh, PA",
  month = nov,
  year = "1996",
  keywords = "contest",
}

@Article{grupe.ea:data-base-discovering:95,
  author = "F. H. Grupe and M. M. Owrang",
  address = "Univ Nevada, Reno, Nv, 89557 American Univ, Washington, Dc, 20016",
  title = "Data-base mining - discovering new knowledge and competitive advantage",
  journal = "Information Systems Management",
  year = "1995",
  volume = "12",
  number = "4",
  pages = "26--31",
  abstract = "Buried in the huge data bases assembled by large organizations is information useful for generating new facts and relationships that can provide significant competitive advantage. This article describes how data base mining extracts knowledge from existing data bases, data base mining applications and their limitations, and bottom-line benefits.",
}

@InProceedings{gunopulos.ea:hypergraph-transversals:97,
  title = "Data mining, Hypergraph Transversals, and Machine Learning",
  author = "Dimitrios Gunopulos and Roni Khardon and Heikki Mannila and Hannu Toivonen",
  booktitle = "Proceedings of the Sixteenth {ACM} {SIGACT}-{SIGMOD}-{SIGART} Symposium on Principles of Database Systems",
  month = "12--15 " # may,
  year = "1997",
  address = "Tucson, Arizona",
}

@Article{gunter:mother-lode:96,
  author = "B.
Gunter",
  address = "Pob 9, Hopewell, Nj, 08525",
  title = "Data mining - mother lode or fools gold",
  journal = "Quality Progress",
  year = "1996",
  volume = "29",
  number = "4",
  pages = "113",
}

@Article{guo.ea:classification-trees:92,
  author = "Heng Guo and Saul B. Gelfand",
  title = "Classification trees with Neural Network Feature Extraction",
  journal = "IEEE Transactions on Neural Networks",
  year = "1992",
  volume = "3",
  number = "6",
  pages = "923--933",
  month = nov,
  keywords = "Neural Nets, binary decision trees, CART",
  annote = "Uses small multilayer nets at the decision nodes of a binary classification tree. Comparison with CART",
}

@InProceedings{hahn.ea:deep-natural:97,
  title = "Deep Knowledge Discovery from Natural Language Texts",
  author = "Udo Hahn and Klemens Schnattinger",
  pages = "175",
  crossref = "heckerman.ea:proceedings-third:97",
}

% The given name must start with an upper-case letter: BibTeX reads a
% lower-case leading token as a "von" particle rather than a first name.
@InProceedings{haimowitz.ea:integrating-distributed:97,
  title = "Integrating and Mining Distributed Customer Databases",
  author = "Ira J. Haimowitz and {\"{O}}zden G{\"{u}}r-Ali and Henry Schwarz",
  pages = "179",
  crossref = "heckerman.ea:proceedings-third:97",
}

@Article{hale.ea:analyzing-fd:96,
  author = "J. Hale and S. Shenoi",
  address = "Univ Tulsa, Dept Math \& Comp Sci, Keplinger Hall, Tulsa, Ok, 74104 Univ Tulsa, Dept Math \& Comp Sci, Tulsa, Ok, 74104",
  title = "Analyzing {FD} inference in relational databases",
  journal = "Data \& Knowledge Engineering",
  year = "1996",
  volume = "18",
  number = "2",
  pages = "167--183",
  abstract = "This paper deals with the general problem of analyzing fuzzy inference based on functional dependencies (FDs) in database relations. Fuzzy inference, the ability to infer fuzzy set values, generalizes imprecise (set-valued) inference and precise inference. Likewise, fuzzy relational databases generalize their classical and imprecise counterparts by supporting fuzzy information storage and retrieval.
Inference analysis is performed using a special abstract model which maintains vital links to classical, imprecise and fuzzy relational database models. These links increase the utility of the inference formalism in practical applications involving ''catalytic inference analysis'', including knowledge discovery and database security.",
  keywords = "DATABASE INFERENCE, FUNCTIONAL DEPENDENCIES, KNOWLEDGE DISCOVERY, DATABASE SECURITY, FUZZY SETS",
}

@Article{hale.ea:practical-formalism:94,
  author = "J. Hale and J. Threet and S. Shenoi",
  address = "Univ Tulsa, Dept Math \& Comp Sci, Keplinger Hall, Tulsa, Ok, 74104",
  title = "A practical formalism for imprecise inference control",
  journal = "IFIP Transactions A: Computer Science and Technology",
  year = "1994",
  volume = "60",
  pages = "139--156",
  abstract = "This paper describes a powerful, yet practical, formalism for modeling and controlling imprecise FD-based inference in relational database systems. The formalism provides a canonical representation of inference which unifies precise inference and the primitive imprecise inference mechanisms of abduction and partial deduction. Whereas other imprecise (partial) inference models estimate the probability of making inferences, the formalism supports the analysis of the actual imprecise values inferred in a database extension. Imprecise inference is analyzed by transforming a precise database augmented with additional ''catalytic'' relations, conveying possibly imprecise a priori knowledge, into an equivalent imprecise database. The analysis of imprecise inference and the related inference control methodology are highly flexible and robust. They can be directly applied to classical, MLS, and imprecise databases. With minimal modifications, they also can be used in knowledge discovery or database mining.",
  keywords = "DATABASE MANAGEMENT, GENERAL, ARTIFICIAL INTELLIGENCE, DEDUCTION AND THEOREM PROVING",
}

@Article{hamilton.ea:estimating-dblearns:95,
  author = "H. J. Hamilton and D.
R. Fudger",
  address = "Univ Regina, Dept Comp Sci, Regina, Sk S4S 0A2, Canada",
  title = "Estimating {DBLEARN}'s potential for knowledge discovery in databases",
  journal = "Computational Intelligence",
  year = "1995",
  volume = "11",
  number = "2",
  pages = "280--296",
  abstract = "We propose a procedure for estimating DBLEARN's potential for knowledge discovery, given a relational database and concept hierarchies. This procedure is most useful for evaluating alternative concept hierarchies for the same database. The DBLEARN knowledge discovery program uses an attribute-oriented inductive-inference method to discover potentially significant high-level relationships in a database. A concept forest, with at most one concept hierarchy for each attribute, defines the possible generalizations that DBLEARN can make for a database. The potential for discovery in a database is estimated by examining the complexity of the corresponding concept forest. Two heuristic measures are defined based on the number, depth, and height of the interior nodes. Higher values for these measures indicate more complex concept forests and arguably more potential for discovery. Experimental results using a variety of concept forests and four commercial databases show that in practice both measures permit quite reliable decisions to be made; thus, the simplest may be most appropriate.",
  keywords = "KNOWLEDGE DISCOVERY, CONCEPT HIERARCHIES, DISCOVERY POTENTIAL, DATABASES, MACHINE LEARNING",
}

@InProceedings{han.ea:attribute-oriented-approach:92,
  author = "Jiawei Han and Yandong Cai and Nick Cercone",
  title = "Knowledge Discovery in Databases: An Attribute-oriented Approach",
  booktitle = "Proceedings of the 18th {VLDB} Conference",
  pages = "547--559",
  address = "Vancouver, British Columbia, Canada",
  year = "1992",
  month = aug,
  keywords = "dblearn",
  annote = "simple hierarchies are used to generate attribute summaries",
}

@Article{han.ea:data-driven-quantitative:93,
  author = "J. W. Han and Y. D. Cai and N.
Cercone",
  address = "Simon Fraser Univ, Sch Comp Sci, Ctr Syst Sci, Burnaby V5A 1S6, Bc, Canada",
  title = "Data-driven discovery of quantitative rules in relational databases",
  journal = "IEEE Transactions on Knowledge and Data Engineering",
  year = "1993",
  month = feb,
  volume = "5",
  number = "1",
  pages = "29--40",
  abstract = "A quantitative rule is a rule associated with quantitative information which assesses the representativeness of the rule in the database. In this paper, an efficient induction method is developed for learning quantitative rules in relational databases. With the assistance of knowledge about concept hierarchies, data relevance, and expected rule forms, attribute-oriented induction can be performed on the database, which integrates database operations with the learning process and provides a simple, efficient way of learning quantitative rules from large databases. Our method learns both characteristic rules and classification rules. Quantitative information facilitates quantitative reasoning, incremental learning, and learning in the presence of noise. Moreover, learning qualitative rules can be treated as a special case of learning quantitative rules. Our paper shows that attribute-oriented induction provides an efficient and effective mechanism for learning various kinds of knowledge rules from relational databases.",
  keywords = "KNOWLEDGE DISCOVERY IN DATABASES, MACHINE LEARNING, ATTRIBUTE-ORIENTED INDUCTION, QUANTITATIVE RULES, CHARACTERISTIC RULES, CLASSIFICATION RULES, DATA-DRIVEN LEARNING ALGORITHMS",
}

@InProceedings{han.ea:dblearn-system:92,
  author = "J. Han and Y. Cai and N. Cercone and Y. Huang",
  title = "{DBLEARN}: {A} Knowledge Discovery System for Large Databases",
  booktitle = "Int. Conf. on Information and Knowledge Management, Baltimore",
  year = "1992",
  month = nov,
}

@Article{han.ea:dblearn-system:94,
  author = "Jiawei Han and Yongjian Fu and Yue Huang and Yandong Cai and N.
Cercone",
  title = "{DBLearn}: {A} System Prototype for Knowledge Discovery in Relational Databases",
  journal = "SIGMOD Record (ACM Special Interest Group on Management of Data)",
  volume = "23",
  number = "2",
  pages = "516",
  month = jun,
  year = "1994",
  ISSN = "0163-5808",
  affiliation = "Sch. of Comput. Sci., Simon Fraser Univ., Burnaby, BC, Canada",
  classification = "C6160D (Relational DBMS); C6160K (Deductive databases)",
  keywords = "DBLearn; System prototype; Knowledge discovery; Relational databases; Data mining system; Knowledge rule extraction; High level learning interfaces; Automatic refinement; Concept hierarchies; Efficient discovery algorithms; Performance; Knowledge mining; Object-oriented databases; Deductive databases; Spatial databases",
  thesaurus = "Deductive databases; Knowledge acquisition; Relational databases; Very large databases",
  xxcrossref = "Anonymous:1994:ASI",
}

@InProceedings{han.ea:dbminer-interactive:96,
  title = "{DBMiner}: Interactive Mining of Multiple-Level Knowledge in Relational Databases",
  author = "Jiawei Han and Yongjian Fu and Wei Wang and Jenny Chiang and Osmar R. Za{\"\i}ane and Krzysztof Koperski",
  editor = "H. V. Jagadish and Inderpal Singh Mumick",
  booktitle = "Proceedings of the 1996 {ACM} {SIGMOD} International Conference on Management of Data",
  address = "Montreal, Quebec, Canada",
  month = "4--6~" # jun,
  year = "1996",
  pages = "550",
}

% Author spellings (Koperski, Zaiane) follow han.ea:dbminer-interactive:96.
@InProceedings{han.ea:dbminer-system:96,
  title = "{DBMiner}: {A} System for Mining Knowledge in Large Relational Databases",
  pages = "250",
  author = "Jiawei Han and Yongjian Fu and Wei Wang and Jenny Chiang and Wan Gong and Krzysztof Koperski and Deyi Li and Yijun Lu and Amynmohamed Rajan and Nebojsa Stefanovic and Betty Xia and Osmar R. Za{\"\i}ane",
  crossref = "simoudis.ea:proceedings-second:96",
}

@Article{han.ea:intelligent-query:96,
  author = "J. W. Han and Y. Huang and N. Cercone and Y. J.
Fu",
  address = "Simon Fraser Univ, Sch Comp Sci, Burnaby, Bc V5A 1S6, Canada Univ Regina, Dept Comp Sci, Regina, Sk S4S 0A2, Canada",
  title = "Intelligent query answering by knowledge discovery techniques",
  journal = "IEEE Transactions on Knowledge and Data Engineering",
  year = "1996",
  volume = "8",
  number = "3",
  pages = "373--390",
  abstract = "Knowledge discovery facilitates querying database knowledge and intelligent query answering in database systems. In this paper, we investigate the application of discovered knowledge, concept hierarchies, and knowledge discovery tools for intelligent query answering in database systems. A knowledge-rich data model is constructed to incorporate discovered knowledge and knowledge discovery tools. Queries are classified into data queries and knowledge queries. Both types of queries can be answered directly by simple retrieval or intelligently by analyzing the intent of query and providing generalized, neighborhood or associated information using stored or discovered knowledge. Techniques have been developed for intelligent query answering using discovered knowledge and/or knowledge discovery tools, which includes generalization, data summarization, concept clustering, rule discovery, query rewriting, deduction, lazy evaluation, application of multiple-layered databases, etc.
Our study shows that knowledge discovery substantially broadens the spectrum of intelligent query answering and may have deep implications on query answering in data- and knowledge-base systems.",
  keywords = "RELATIONAL DATABASES, MODEL, DATABASE AND KNOWLEDGE-BASE SYSTEMS, KNOWLEDGE DISCOVERY IN DATABASES, KNOWLEDGE-RICH DATA MODEL, INTELLIGENT QUERY ANSWERING, MULTIPLE LAYERED DATABASES, QUERY ANALYSIS AND QUERY PROCESSING",
}

% Tech reports need an "institution" field; "school" is only used for theses.
@TechReport{han.ea:multi-dimensional-association:97,
  number = "TR 97-06",
  author = "Jiawei Han and Micheline Kamber and Jenny Chiang",
  title = "Mining Multi-Dimensional Association Rules Using Data Cubes",
  month = feb,
  year = "1997",
  org = "SFU-CMPT",
  institution = "School of Computing Science, Simon Fraser University",
  abstract = "Methods for mining association rules have been studied extensively. However, most previous studies have been confined to the mining of single dimensional and single variable association rules. There are applications in relational databases and data warehouses which require the mining of multi-dimensional association rules. In this paper, we study efficient methods for mining multi-dimensional association rules using a data cube structure, a popular data structure used in data warehouses. Efficient algorithms are developed for mining multi-dimensional association rules by either using an existing data cube, when available, or construction of a data cube on the fly. In both cases, the algorithms outperform the direct application of a table-based Apriori algorithm to the mining of multi-dimensional association rules.
The extension of the method for mining multi-level, multi-dimensional association rules and meta-rule guided mining is also discussed in the paper.",
  URL = "ftp://fas.sfu.ca/pub/cs/TR/1997/CMPT97-06.ps.Z",
}

% Tech reports need an "institution" field; "school" is only used for theses.
@TechReport{han.ea:multiple-level-association:95,
  number = "TR 95-05",
  author = "Jiawei Han and Yongjian Fu",
  title = "Discovery of Multiple-Level Association Rules from Large Databases",
  month = mar,
  year = "1995",
  org = "SFU-CMPT",
  institution = "School of Computing Science, Simon Fraser University",
  pages = "35",
  abstract = "Discovery of association rules from large databases has been a focused topic recently in the research into database mining. Previous studies discover association rules at a single concept level, however, mining association rules at multiple concept levels may lead to finding more informative and refined knowledge from data. In this paper, we study efficient methods for mining multiple-level association rules from large transaction databases. A top-down progressive deepening method is proposed by extension of some existing (single-level) association rule mining algorithms. In particular, a group of algorithms for mining multiple-level association rules are developed and their relative performance are tested on different kinds of transaction data. Relaxation of the rule conditions for finding flexible multiple-level association rules is also discussed. Our study shows that efficient algorithms can be developed for the discovery of interesting and strong multiple-level association rules from large databases.",
  URL = "ftp://ftp.fas.sfu.ca/pub/cs/techreports/1995/CMPT95-05.ps.Z",
}

@TechReport{han.ea:resource-global:94,
  number = "TR 94-10",
  author = "Jiawei Han and Osmar R.
Za{\"\i}ane and Yongjian Fu",
  title = "Resource and Knowledge Discovery in Global Information Systems: {A} Multiple Layered Database Approach",
  month = nov,
  year = "1994",
  org = "SFU-CMPT",
  institution = "School of Computing Science, Simon Fraser University",
  pages = "30",
  keywords = "Resource Discovery, Knowledge Discovery, Data Mining, Multiple Layered Database, Internet, World Wide Web, Global Information Network",
  abstract = "With huge amounts of information connected to the global information network (Internet), efficient and effective discovery of resource and knowledge from the ``global information base'' has become an imminent research issue, especially with the advent of the Information Highway. In this article, a multiple layered database (MLDB) approach is proposed to handle the resource and knowledge discovery in global information base. A multiple layered database is a database formed by generalization and transformation of the information, layer-by-layer, starting from the original information base (treated as layer-0, the primitive layer). Information retrieval, data mining, and data analysis techniques can be used to extract and transform information from a lower layer database to a higher one. Layer-1 and higher layers of an MLDB can be modeled by an extended-relational or object-oriented model, constructed automatically, and updated incrementally. Information at all the layers except the primitive one can be stored, managed and retrieved by the available database technology; resources can be found by controlled search through different layers of the database; and knowledge discovery can be performed efficiently in such a multiple layered database.",
  URL = "ftp://ftp.fas.sfu.ca/pub/cs/techreports/1994/CMPT94-10.ps.Z",
  note = "(also CSS/LCCR TR94-24)",
}

@InProceedings{han:techniques:96,
  title = "Data Mining Techniques",
  author = "Jiawei Han",
  editor = "H. V.
Jagadish and Inderpal Singh Mumick",
  booktitle = "Proceedings of the 1996 {ACM} {SIGMOD} International Conference on Management of Data",
  address = "Montreal, Quebec, Canada",
  month = "4--6~" # jun,
  year = "1996",
  pages = "545",
}

@Article{han:towards-efficient:94,
  author = "J. W. Han",
  address = "Simon Fraser Univ, Sch Comp Sci, Burnaby V5A 1S6, Bc, Canada",
  title = "Towards efficient induction mechanisms in database-systems",
  journal = "Theoretical Computer Science",
  year = "1994",
  volume = "133",
  number = "2",
  pages = "361--385",
  abstract = "With the wide availability of huge amounts of data in database systems, the extraction of knowledge in databases by efficient and powerful induction or knowledge discovery mechanisms has become an important issue in the construction of new generation database and knowledge-base systems. In this article, an attribute-oriented induction method for knowledge discovery in databases is investigated, which provides an efficient, set-oriented induction mechanism for extraction of different kinds of knowledge rules, such as characteristic rules, discriminant rules, data evolution regularities and high level dependency rules in large relational databases. Our study shows that the method is robust in the existence of noise and database updates, is extensible to knowledge discovery in advanced and/or special purpose databases, such as object-oriented databases, active databases, spatial databases, etc., and has wide applications.",
}

@InProceedings{harris.ea:mega-classification-discovering:92,
  author = "N. L. Harris and L. Hunter and D. J. States",
  title = "Mega-Classification: Discovering Motifs in Massive Datastreams",
  year = "1992",
  booktitle = "Proc.\ of AAAI-92",
  pages = "837--842",
}

@InProceedings{hatonen.ea:telecommunication-network:96,
  author = "K. H{\"a}t{\"o}nen and M. Klemettinen and H. Mannila and P. Ronkainen and H. Toivonen",
  title = "Knowledge discovery from telecommunication network alarm databases",
  editor = "Stanley Y. W.
Su",
  booktitle = "Proceedings of the Twelfth International Conference on Data Engineering, February 26--March 1, 1996, New Orleans, Louisiana",
  publisher = "IEEE Computer Society Press",
  address = "1109 Spring Street, Suite 300, Silver Spring, MD 20910, USA",
  year = "1996",
  pages = "115--122",
  affiliation = "Dept. of Comput. Sci., Helsinki Univ., Finland",
  keywords = "Knowledge discovery, data mining, frequent episodes, telecommunication alarm databases, telecommunication network management, SGML",
  URL = "http://www.cs.Helsinki.FI/research/pmdm/datamining/ICDE96.html",
  abstract = "A telecommunication network produces daily large amounts of alarm data. The data contains hidden valuable knowledge about the behavior of the network. This knowledge can be used in filtering redundant alarms, locating problems in the network, and possibly in predicting severe faults. We describe the TASA (Telecommunication Network Alarm Sequence Analyzer) system for discovering and browsing knowledge from large alarm databases. The system is built on the basis of viewing knowledge discovery as an interactive and iterative process, containing data collection, pattern discovery, rule postprocessing, etc. The system uses a novel framework for locating frequently occurring episodes from sequential data. The TASA system offers a variety of selection and ordering criteria for episodes, and supports iterative retrieval from the discovered knowledge. This means that a large part of the iterative nature of the KDD process can be replaced by iteration in the rule postprocessing stage. The user interface is based on dynamically generated HTML. The system is in experimental use, and the results are encouraging: some of the discovered knowledge is being integrated into the alarm handling software of telecommunication operators.",
}

@Article{hau.ea:learning-qualitative:97,
  author = "D. T. Hau and E. W.
Coiera",
  address = "Johns Hopkins Univ, Sch Med, Baltimore, Md, 21205 Hewlett Packard Labs, Bristol Bs12 6Qz, Avon, England",
  title = "Learning qualitative models of dynamic systems",
  journal = "Machine Learning",
  year = "1997",
  volume = "26",
  number = "2--3",
  pages = "177--211",
  abstract = "The automated construction of dynamic system models is an important application area for ILP. We describe a method that learns qualitative models from time-varying physiological signals. The goal is to understand the complexity of the learning task when faced with numerical data, what signal processing techniques are required, and how this affects learning. The qualitative representation is based on Kuipers' QSIM. The learning algorithm for model construction is based on Coiera's GENMODEL. We show that QSIM models are efficiently PAC learnable from positive examples only, and that GENMODEL is an ILP algorithm for efficiently constructing a QSIM model. We describe both GENMODEL which performs RLGG on qualitative states to learn a QSIM model, and the front-end processing and segmenting stages that transform a signal into a set of qualitative states. Next we describe results of experiments on data from six cardiac bypass patients. Useful models were obtained, representing both normal and abnormal physiological states. Model variation across time and across different levels of temporal abstraction and fault tolerance is explored. The assumption made by many previous workers that the abstraction of examples from data can be separated from the learning task is not supported by this study. Firstly, the effects of noise in the numerical data manifest themselves in the qualitative examples.
Secondly, the models learned are directly dependent on the initial qualitative abstraction chosen.",
  keywords = "SIMULATION, inductive logic programming, qualitative modelling, system identification, PAC learning, physiological modelling, cardiovascular system, data mining, patient monitoring",
}

@Article{haughton:digging-gold:94,
  author = "Emma Haughton",
  title = "Digging For Gold",
  journal = "Computing",
  year = "1994",
  pages = "20--21",
  month = jan,
  keywords = "Logica, applications, neural networks, Data Mariner, Netmap, 4thought, autonet, recognition research",
}

% Author list repaired: the separator between the first two names had lost its space.
@InProceedings{haussler.ea:kdd-science:96,
  title = "{KDD} for Science Data Analysis: Issues and Examples",
  pages = "50",
  author = "Usama Fayyad and David Haussler and Paul Stolorz",
  crossref = "simoudis.ea:proceedings-second:96",
}

@Article{haussler:quantifying-inductive:88,
  author = "D. Haussler",
  title = "Quantifying Inductive Bias: {AI} Learning Algorithms and Valiant's Learning Framework",
  year = "1988",
  journal = "Artificial Intelligence",
  volume = "36",
  number = "2",
  month = sep,
  pages = "177--221",
}

% booktitle repeats title so that entries citing this one via crossref
% inherit a booktitle under classic BibTeX (title is not mapped to booktitle).
@Proceedings{heckerman.ea:proceedings-third:97,
  title = "Proceedings of the Third International Conference on Knowledge Discovery and Data Mining ({KDD}-97)",
  booktitle = "Proceedings of the Third International Conference on Knowledge Discovery and Data Mining ({KDD}-97)",
  year = "1997",
  editor = "David Heckerman and Heikki Mannila and Daryl Pregibon and Ramasamy Uthurusamy",
  publisher = "AAAI Press",
}

@Article{hedberg:gold-rush:95,
  author = "Sara Reese Hedberg",
  title = "The Data Gold Rush --- Here's how corporations, researchers, and scientists are using data-mining techniques to discover everything from new customers to new galaxies",
  journal = "Byte Magazine",
  volume = "20",
  number = "10",
  pages = "83--??",
  month = oct,
  year = "1995",
  ISSN = "0360-5280",
}

@Article{hedberg:parallelism-speeds:95,
  author = "S. R.
Hedberg",
  address = "Zymogenet Inc, 1201 Eastlake Ave E, Seattle, Wa, 98102",
  title = "Parallelism speeds data mining",
  journal = "IEEE Parallel \& Distributed Technology",
  year = "1995",
  volume = "3",
  number = "4",
  pages = "3--6",
}

% The issue number is stored in the number field, which standard styles print.
@Article{hedberg:searching-mother:96,
  author = "S. R. Hedberg",
  address = "Mit, Spoken Language Syst Grp, Cambridge, Ma, 02139",
  title = "Searching for the mother lode - tales of the first data miners",
  journal = "IEEE Expert-Intelligent Systems \& Their Applications",
  year = "1996",
  volume = "11",
  number = "5",
  pages = "4--7",
}

@Article{heider:troubleshooting-cfm:96,
  author = "R. Heider",
  title = "Troubleshooting {CFM} 56-3 Engines for the {Boeing 737} --- Using {CBR} and Data-Mining",
  journal = "Lecture Notes in Computer Science",
  volume = "1168",
  pages = "512--??",
  year = "1996",
  ISSN = "0302-9743",
}

@InProceedings{hekanaho:ga-based-rule:97,
  title = "{GA}-Based Rule Enhancement in Concept Learning",
  author = "Jukka Hekanaho",
  pages = "183",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InProceedings{hinke.ea:target-independent-scientific:97,
  title = "Target-Independent Mining for Scientific Data: Capturing Transients and Trends for Phenomena Mining",
  author = "Thomas H. Hinke and John Rushing and Heggere Ranganath and Sara J. Graves",
  pages = "187",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InProceedings{ho.ea:zeta-global:97,
  title = "Zeta: {A} Global Method for Discretization of Continuous Variables",
  author = "K. M. Ho and P. D. Scott",
  pages = "191",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InProceedings{hofacker.ea:rna-sequence:96,
  title = "Knowledge Discovery in {RNA} Sequence Families of {HIV} Using Scalable Computers",
  pages = "20",
  author = "Ivo L. Hofacker and Martijn A. Huynen and Peter F. Stadler and Paul E.
Stolorz",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{hofmann.ea:inferring-hierarchical:96,
  title = "Inferring Hierarchical Clustering Structures by Deterministic Annealing",
  pages = "363",
  author = "Thomas Hofmann and Joachim M. Buhmann",
  crossref = "simoudis.ea:proceedings-second:96",
}

@Book{holland.ea:induction-processes:86,
  author = "John H. Holland and Keith J. Holyoak and Richard E. Nisbett and Paul R. Thagard",
  title = "Induction: processes of inference, learning and discovery",
  publisher = "MIT Press",
  year = "1986",
  series = "Computational models of cognition and perception",
  address = "Cambridge",
}

% Title repaired: the word "and" had been dropped.
@Book{holland:adaptation-natural:75,
  author = "John H. Holland",
  title = "Adaptation in natural and artificial systems",
  publisher = "University of Michigan Press",
  year = "1975",
  address = "Ann Arbor",
}

@InCollection{holland:escaping-brittleness:86,
  author = "John H. Holland",
  title = "Escaping brittleness: the possibilities of general purpose algorithms applied to parallel rule-based systems",
  crossref = "michalski.ea:machine-learning:86",
  pages = "593--623",
}

% number is taken from the report file name in the URL. The year follows the
% CWI numbering pattern used by the other CWI reports in this file
% (CS-R94nn is 1994) -- TODO confirm against the report itself.
@TechReport{holsheimer.ea:architectural-support:,
  URL = "ftp://ftp.cwi.nl/pub/CWIreports/AA/CS-R9429.ps.Z",
  title = "Architectural Support for Data Mining",
  author = "Marcel Holsheimer and Martin L. Kersten",
  institution = "CWI Amsterdam",
  address = "PO Box 94079, 1090 GB, Amsterdam, The Netherlands",
  number = "CS-R9429",
  year = "1994",
  abstract = "One of the main obstacles in applying data mining techniques to large, real-world databases is the lack of efficient data management. In this paper, we present the design and implementation of an effective two-level architecture for a data mining environment. It consists of a mining tool and a parallel DBMS server. The mining tool organizes and controls the search process, while the DBMS provides optimal response times for the few query types being used by the tool.
Key elements of our architecture are its use of fast and simple database operations, its re-use of results obtained by previous queries, its maximal use of main-memory to keep the database hot-set resident, and its parallel computation of queries.

Apart from a clear separation of responsibilities, we show that this architecture leads to competitive performance on large data sets. Moreover, this architecture provides a flexible experimentation platform for further studies in optimization of repetitive database queries and quality driven rule discovery schemes.

CR subject classification (1991): Data storage representations (E.2), Database systems (H.2.4) parallel systems, query processing, Information search and retrieval (H.3.3), Learning (I.2.6) induction, knowledge acquisition

Keywords \& Phrases: data mining, parallel databases, inductive learning, knowledge discovery in databases",
}

% Re-typed from InCollection to TechReport: the entry never had a booktitle and
% is CWI report CS-R9531 (see its URL). The ISSN that was stored in the address
% field now lives in the ISSN field; the former publisher is the institution.
@TechReport{holsheimer.ea:perspective-on:95,
  author = "Marcel Holsheimer and Martin L. Kersten and Heikki Mannila and Hannu Toivonen",
  title = "A perspective on databases and data mining",
  pages = "10",
  institution = "Centrum voor Wiskunde en Informatica (CWI)",
  type = "Report",
  number = "CS-R9531",
  ISSN = "0169-118X",
  month = apr # " 30",
  year = "1995",
  keywords = "Association rules, database techniques, generalization hierarchies.",
  URL = "ftp://ftp.cwi.nl/pub/CWIreports/AA/CS-R9531.ps.Z",
  abstract = "We discuss the use of database methods for data mining. Recently impressive results have been achieved for some data mining problems using highly specialized and clever data structures. We study how well one can manage by using general purpose database management systems. We illustrate our ideas by investigating the use of a dbms for a well-researched area: the discovery of association rules. We present a simple algorithm, consisting of only union and intersection operations, and show that it achieves quite good performance on an efficient dbms. Our method can incorporate inheritance hierarchies to the association rule algorithm easily. We also present a technique that effectively reduces the number of database operations when searching large search spaces that contain only few interesting items.
Our work shows that database techniques are promising for data mining: general architectures can achieve reasonable results.",
  note = "AA (Department of Algorithmics and Architecture)",
  annote = "Originally contained the following fields and values - booktitle, 128, note,CS-R9531",
}

% Keyword spelling corrected (Knowledge).
@TechReport{holsheimer.ea:search:94,
  URL = "ftp://ftp.cwi.nl/pub/CWIreports/AA/CS-R9406.ps.Z",
  title = "Data Mining, The Search for Knowledge in Databases",
  author = "Marcel Holsheimer and Arno Siebes",
  pages = "88",
  institution = "CWI, Amsterdam",
  address = "PO Box 94079, 1090 GB Amsterdam, The Netherlands",
  keywords = "Data Mining, Knowledge Engineering, Database applications, Information search and Retrieval.",
  number = "CS-R9406",
  type = "Report",
  year = "1994",
  annote = "Large (88 pages), Includes information on ID3, AQ15, CN2, DBLearn, Meta-Dendral, Radix/Rx, Bacon and KEDS.",
  abstract = "Data mining is the search for relationships and global patterns that exist in large databases, but are `hidden' among the vast amounts of data, such as a relationship between patient data and their medical diagnosis. These relationships represent valuable knowledge about the database and objects in the database and, if the database is a faithful mirror, of the real world registered by the database. One of the main problems for data mining is that the number of possible relationships is very large, thus prohibiting the search for the correct ones by simple validating each of them. Hence, we need intelligent search strategies, as taken from the area of machine learning. Another important problem is that information in data objects is often corrupted or missing. Hence, statistical techniques should be applied to estimate the reliability of the discovered relationships. The report provides a survey of current data mining research, it presents the main underlying ideas, such as inductive learning, and search strategies and knowledge representations used in data mine systems.
Furthermore, it describes the most important problems and their solutions, and provides a survey of research projects.",
}

% Journal name written out in the same form used elsewhere in this file;
% issue number moved to the number field; doubled comma removed from keywords.
@Article{hou:extraction-applications:96,
  author = "W. C. Hou",
  address = "So Illinois Univ, Dept Comp Sci, Carbondale, Il, 62901",
  title = "Extraction and applications of statistical relationships in relational databases",
  journal = "IEEE Transactions on Knowledge and Data Engineering",
  year = "1996",
  volume = "8",
  number = "6",
  pages = "939--945",
  abstract = "In this paper, we discuss modeling and extraction of statistical relationships among attributes. Different methods are used for extraction of different types of relationships. A complete methodology for extraction is developed by integrating widely accepted statistical methods. Statistical relationships manifest embedded relationships in data and thus lend themselves naturally to estimating unknown attribute values and detecting unlikely values. We will carefully examine these applications and evaluate the usefulness of statistical relationships in these applications using a real-life database.",
  keywords = "data mining, estimating unknown attribute values, integration of data mining techniques, integrity constraints, knowledge discovery in databases, statistical relationships",
}

@InProceedings{houtsma.ea:set-oriented-association:95,
  author = "M. Houtsma and A. Swami",
  title = "Set-Oriented Mining for Association Rules in Relational Databases",
  pages = "25--34",
  editor = "P. S. Yu and A. L. P. Chen",
  booktitle = "Proceedings of the 11th International Conference on Data Engineering",
  month = mar,
  publisher = "IEEE Computer Society Press",
  address = "Los Alamitos, CA, USA",
  year = "1995",
}

@Article{houtsma.ea:set-oriented-relational:95,
  author = "M. Houtsma and A.
Swami",
  address = "Telemat Res Ctr, Pob 217, 7500 Ae Enschede, Netherlands Univ Twente, 7500 Ae Enschede, Netherlands Ibm Corp, Almaden Res Ctr, San Jose, Ca",
  title = "Set-oriented data mining in relational databases",
  journal = "Data \& Knowledge Engineering",
  year = "1995",
  volume = "17",
  number = "3",
  pages = "245--262",
  abstract = "Data mining is an important real-life application for businesses. It is critical to find efficient ways of mining large data sets. In order to benefit from the experience with relational databases, a set-oriented approach to mining data is needed. In such an approach, the data mining operations are expressed in terms of relational or set-oriented operations. Query optimization technology can then be used for efficient processing. In this paper, we describe set-oriented algorithms for mining association rules. Such algorithms imply performing multiple joins and thus may appear to be inherently less efficient than special-purpose algorithms. We develop new algorithms that can be expressed as SQL queries, and discuss optimization of these algorithms. After analytical evaluation, an algorithm named SETM emerges as the algorithm of choice. Algorithm SETM uses only simple database primitives, viz., sorting and merge-scan join. Algorithm SETM is simple, fast, and stable over the range of parameter values. It is easily parallelized and we suggest several additional optimizations. The set-oriented nature of Algorithm SETM makes it possible to develop extensions easily and its performance makes it feasible to build interactive data mining tools for large databases.",
  keywords = "DATA MINING, OPTIMIZATION, SET-ORIENTED ALGORITHMS",
}

@Article{hu.ea:learning-relational:95,
  author = "X. H. Hu and N.
Cercone",
  address = "Univ Regina, Dept Comp Sci, Regina, Sk S4S 0A2, Canada",
  title = "Learning in relational databases - a rough set approach",
  journal = "Computational Intelligence",
  year = "1995",
  volume = "11",
  number = "2",
  pages = "323--338",
  abstract = "Knowledge discovery in databases, or data mining, is an important direction in the development of data and knowledge-based systems. Because of the huge amount of data stored in large numbers of existing databases, and because the amount of data generated in electronic forms is growing rapidly, it is necessary to develop efficient methods to extract knowledge from databases. An attribute-oriented rough set approach has been developed for knowledge discovery in databases. The method integrates machine-learning paradigm, especially learning-from-examples techniques, with rough set techniques. An attribute-oriented concept tree ascension technique is first applied in generalization, which substantially reduces the computational complexity of database learning processes. Then the cause-effect relationship among the attributes in the database is analyzed using rough set techniques, and the unimportant or irrelevant attributes are eliminated. Thus concise and strong rules with little or no redundant information can be learned efficiently. Our study shows that attribute-oriented induction combined with rough set theory provide an efficient and effective mechanism for knowledge discovery in database systems.",
  keywords = "KNOWLEDGE DISCOVERY IN DATABASES, MACHINE LEARNING, ROUGH SET, ATTRIBUTE-ORIENTED INDUCTION",
}

@InProceedings{hu.ea:rules-rough:96,
  author = "X. Hu and N.
Cercone",
  title = "Mining Knowledge Rules from Databases: {A} Rough Set Approach",
  pages = "96--105",
  booktitle = "Proceedings of the 12th International Conference on Data Engineering",
  month = feb,
  publisher = "IEEE Computer Society",
  address = "Washington - Brussels - Tokyo",
  year = "1996",
}

@PhdThesis{hu:attribute-oriented-rough:,
  title = "Knowledge Discovery in Databases: An Attribute-Oriented Rough Set Approach",
  author = "Tony Xiaohua Hu",
  URL = "http://www.cs.bham.ac.uk/~anp/dm_docs/tony_xiaohua_hu.ps.gz",
}

% school added: it is required for a master's thesis and is named in this
% entry's own annote. The trailing period was dropped from the title.
@MastersThesis{hu:conceptual-clustering:93,
  URL = "ftp://fas.sfu.ca/pub/cs/theses/1993/XiaohuaHuMSc.ps.Z",
  title = "Conceptual Clustering and Concept Hierarchies in Knowledge Discovery",
  author = "Xiaohua Hu",
  school = "Simon Fraser University",
  year = "1993",
  month = jan,
  abstract = "Knowledge discovery is the nontrivial extraction of implicit, previously unknown, and potentially useful information from data. Knowledge discovery from a database is a form of machine learning where the discovered knowledge is represented in a high-level language. The growth in the size and number of existing databases far exceeds human abilities to analyse the data, which creates both a need and an opportunity for extracting knowledge from databases. In this thesis, I propose two algorithms for knowledge discovery in database systems. One algorithm finds knowledge rules associated with concepts in the different levels of the conceptual hierarchy; the algorithm is developed based on earlier attribute-oriented conceptual ascension techniques. The other algorithm combines a conceptual clustering technique and machine learning. It can find three kinds of rules, characteristic rules, inheritance rules, and domain knowledge, even in the absence of a conceptual hierarchy. The two algorithms are implemented as a component of the database learning system (DBLEARN) using C under Sybase/Unix environment.
The test of DBLEARN on NSERC's grant information system shows that our method can discover many meaningful knowledge rules very quickly. The application of knowledge discovery in database is very wide. I will discuss how to apply DBLEARN to a lot of data-intensified areas such as Hospital's patient information system, customer database of telephone company, airplane company and bank, inventory system of department store and so on to find some interesting rules hidden among the data, and how the people in these companies can use these learned rules to help them.",
  annote = "M.Sc Thesis. From Simon Fraser University. The supervisors are Nick Cercone and Jiawei Han. It discusses extensions to DBLearn. 88 pages.",
}

% Issue number moved to the number field; abstract typos repaired.
@Article{hu:object-aggregation:94,
  author = "X. H. Hu",
  address = "Univ Regina, Dept Comp Sci, Regina S4S 0A2, Saskatchewan, Canada",
  title = "Object aggregation and cluster identification - a knowledge discovery approach",
  journal = "Applied Mathematics Letters",
  year = "1994",
  volume = "7",
  number = "4",
  pages = "29--34",
  abstract = "A method for object aggregation and cluster identification has been proposed for knowledge discovery in databases. By integrating conceptual clustering and machine learning (especially learning-from-examples) paradigms, the method classifies the data into different clusters, extracts the characteristics of each cluster, and discovers knowledge rules based on the relationships among different clusters. Different kinds of knowledge rules, including hierarchical, equivalence and inheritance rules can be discovered efficiently.",
  keywords = "KNOWLEDGE DISCOVERY IN DATABASES, CONCEPTUAL CLUSTERING",
}

@InProceedings{huber:large-to:97,
  title = "From Large to Huge: {A} Statistician's Reactions to {KDD} \& {DM}",
  author = "Peter J. Huber",
  pages = "304",
  crossref = "heckerman.ea:proceedings-third:97",
}

@Article{hunter:acquisition-planning:90,
  author = "L.
Hunter",
  title = "Knowledge acquisition planning for inference from large databases.",
  journal = "Hawaii Int. Conf. Sys. Sci-23",
  volume = "2",
  pages = "35--44",
  publisher = "IEEE",
  year = "1990",
  keywords = "AI, database data base, mining, HICSS HICSS23 HICSS90",
}

% MIT AI Lab technical report; the abstract describes the work as a thesis.
@TechReport{hutchinson:radial-basis:93,
  title = "A Radial Basis Function Approach to Financial Time Series Analysis",
  author = "James M. Hutchinson",
  institution = "Artificial Intelligence Laboratory, Massachusetts Institute of Technology (MIT)",
  address = "Cambridge, Massachusetts",
  month = dec,
  year = "1993",
  pages = "160",
  URL = "ftp://publications.ai.mit.edu/ai-publications/1000-1499/AITR-1457.ps.Z",
  abstract = "Nonlinear multivariate statistical techniques on fast computers offer the potential to capture more of the dynamics of the high dimensional, noisy systems underlying financial markets than traditional models, while making fewer restrictive assumptions. This thesis presents a collection of practical techniques to address important estimation and confidence issues for Radial Basis Function networks arising from such a data driven approach, including efficient methods for parameter estimation and pruning, a pointwise prediction error estimator, and a methodology for controlling the ``data mining'' problem.
Novel applications in the finance area are described, including customized, adaptive option pricing and stock price prediction.",
}

@Misc{ibm:white-paper,
  key = "ibm:white-paper",
  title = "Data Mining: Extending the Information Warehouse Framework",
  note = "IBM white paper on data mining",
  URL = "http://booksrv2.raleigh.ibm.com/cgi-bin/bookmgr/bookmgr.cmd/BOOKS/datamine",
}

% Re-typed from Unpublished to Misc: Unpublished requires a note field, which
% this web-page entry never had; Misc matches the IBM web entry above.
@Misc{icebreaker:mining-data:96,
  title = "Mining Data",
  author = "IceBreaker",
  URL = "http://www.bdt.com/icemfg/ice_it.htm",
  keywords = "Data Mining",
  month = sep,
  year = "1996",
}

% Third author's surname corrected (Rayward-Smith).
@InProceedings{iglesia.ea:discovering-commercial:96,
  title = "Discovering Knowledge in Commercial Databases Using Modern Heuristic Techniques",
  pages = "44",
  author = "B. de la Iglesia and J. C. W. Debuse and V. J. Rayward-Smith",
  crossref = "simoudis.ea:proceedings-second:96",
}

@Article{ijis-special-issue:92,
  key = "ijis-special-issue:92",
  title = "{IJIS} Special issue on Knowledge Discovery in Databases and Knowledge Bases",
  journal = "International Journal of Intelligent Systems",
  year = "1992",
  volume = "7",
  number = "7",
  month = sep,
  editor = "G. Piatetsky-Shapiro (guest editor)",
  note = "Special issue on Knowledge Discovery in Databases and Knowledge Bases, edited selection of best papers from AAAI KDD-91 workshop",
}

@Article{imielinski.ea:database-perspective:96,
  author = "T. Imielinski and H. Mannila",
  address = "Rutgers State Univ, Dept Comp Sci, New Brunswick, Nj, 08903 Univ Helsinki, Fin-00014 Helsinki, Finland",
  title = "A database perspective on knowledge discovery",
  journal = "Comm.
of the ACM",
  year = "1996",
  volume = "39",
  number = "11",
  pages = "58--64",
}

@InProceedings{imielinski.ea:datamine-application:96,
  title = "DataMine: Application Programming Interface and Query Language for Database Mining",
  pages = "256",
  author = "Tomasz Imielinski and Aashu Virmani and Amin Abdulghani",
  crossref = "simoudis.ea:proceedings-second:96",
}

% Publisher spelling corrected (Technical).
@Book{inmon.ea:understanding-pattern:91,
  title = "Understanding Data Pattern Processing",
  author = "W. H. Inmon and S. Osterfelt",
  publisher = "QED Technical Publishing Group",
  year = "1991",
  address = "Wellesley, MA.",
  annote = "Piatetsky : a business-oriented, nontechnical book",
}

@Article{inmon:warehouse:96,
  author = "W. H. Inmon",
  title = "The Data Warehouse and Data Mining",
  journal = "Communications of the ACM",
  volume = "39",
  number = "11",
  pages = "49--50",
  month = nov,
  year = "1996",
  ISSN = "0001-0782",
}

@InProceedings{ittner.ea:relevant-new:96,
  title = "Discovery of Relevant New Features by Generating Non-Linear Decision Trees",
  pages = "108",
  author = "Andreas Ittner and Michael Schlosser",
  crossref = "simoudis.ea:proceedings-second:96",
}

% First author was stored as "Han. J.", which parses with J. as the surname;
% the citation key is left unchanged so existing citations keep working.
% Title typo corrected (Rules).
@Article{j.ea:data-driven-quantitative:93,
  author = "J. Han and Y. Cai and N. Cercone",
  title = "Data-Driven Discovery of Quantitative Rules in Relational Databases",
  journal = "IEEE Transactions on Knowledge and Data Engineering",
  pages = "29--40",
  volume = "5",
  number = "1",
  month = feb,
  year = "1993",
}

@InProceedings{jensen.ea:adjusting-multiple:97,
  title = "Adjusting for Multiple Comparisons in Decision Tree Pruning",
  author = "David Jensen and Matt Schmill",
  pages = "195",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InProceedings{john.ea:sipping-firehose:97,
  title = "{SIP}ping from the Data Firehose",
  author = "George H. John and Brian Lent",
  pages = "199",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InProceedings{john.ea:static-versus:96,
  title = "Static Versus Dynamic Sampling for Data Mining",
  pages = "367",
  author = "George H.
John and Pat Langley",
  crossref = "simoudis.ea:proceedings-second:96",
}

@Article{john.ea:stock-selection:96,
  author = "G. H. John and P. Miller and R. Kerber",
  address = "Ibm Corp, Data Min Grp, Armonk, Ny, 10504 Stanford Univ, Dept Comp Sci, Stanford, Ca, 94305 Lockheed Martin Corp, Ctr Artificial Intelligence, Palo Alto, Ca, 94304",
  title = "Stock selection using rule induction",
  journal = "IEEE Expert-Intelligent Systems \& Their Applications",
  year = "1996",
  volume = "11",
  number = "5",
  pages = "52--58",
}

@InCollection{jong:genetic-algorithm-based-learning:90,
  author = "Kenneth De Jong",
  title = "Genetic-algorithm-based learning",
  crossref = "kodratoff.ea:machine-learning:90",
  pages = "611--638",
}

% The degree title "Ph.D." was removed from the author name (it was parsed as
% a given name). The year is inferred from the report number in the URL
% (UIUCDCS-R-95-1874) -- TODO confirm.
@PhdThesis{kadie:seer-maximum:,
  author = "Carl Myers Kadie",
  title = "Seer: Maximum Likelihood Regression for Learning-Speed Curves",
  URL = "ftp://ftp.cs.uiuc.edu/pub/TechReports/UIUCDCS-R-95-1874.ps.Z",
  school = "Department of Computer Science, University of Illinois at Urbana-Champaign",
  year = "1995",
  annote = "The research presented here focuses on modeling machine-learning performance",
}

% Two authors had been run together without a separator; split and ordered so
% that Kahng (the name this entry's key is built from) comes first.
% NOTE(review): confirm the author order against the proceedings.
@InProceedings{kahng.ea:generalized-term:97,
  title = "Mining Generalized Term Associations: Count Propagation Algorithm",
  author = "Jonghyun Kahng and Wen-Hsiang Kevin Liao and Dennis McLeod",
  pages = "203",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InProceedings{kamber.ea:metarule-guided-multi-dimensional:97,
  title = "Metarule-Guided Mining of Multi-Dimensional Association Rules Using Data Cubes",
  author = "Micheline Kamber and Jiawei Han and Jenny Y. Chiang",
  pages = "207",
  crossref = "heckerman.ea:proceedings-third:97",
}

@Article{kantola.ea:discovering-functional:92,
  author = "M. Kantola and H. Mannila and K. J. Raiha and H. Siirtola",
  address = "Univ Tampere, Tampere, Finland Univ Helsinki, Sf-00100 Helsinki 10, Finland",
  title = "Discovering functional and inclusion dependencies in relational databases",
  journal = "International J.
of Intelligent Systems",
  year = "1992",
  volume = "7",
  number = "7",
  pages = "591--607",
  abstract = "We consider the problem of discovering the functional and inclusion dependencies that a given database instance satisfies. This technique is used in a database design tool that uses example databases to give feedback to the designer. If the examples show deficiencies in the design, the designer can directly modify the examples. The tool then infers new dependencies and the database schema can be modified, if necessary. The discovery of the functional and inclusion dependencies can also be used in analyzing an existing database. The problem of inferring functional dependencies has several connections to other topics in knowledge discovery and machine learning. In this article we discuss the use of examples in the design of databases, and give an overview of the complexity results and algorithms that have been developed for this problem.",
  keywords = "DESIGN",
}

@InProceedings{kargupta.ea:scalable-distributed:97,
  title = "Scalable, Distributed Data Mining-An Agent Architecture",
  author = "Hillol Kargupta and Ilker Hamzaoglu and Brian Stafford",
  pages = "211",
  crossref = "heckerman.ea:proceedings-third:97",
}

% edition is written as a capitalised ordinal word, as classic styles expect.
@InCollection{kaufman.ea:goals-general:91,
  crossref = "piatetsky-shapiro.ea:knowledge-discovery:91",
  editor = "Gregory Piatetsky-Shapiro and William J. Frawley",
  booktitle = "Knowledge Discovery in Databases",
  publisher = "AAAI Press / The MIT Press",
  address = "Menlo Park, California",
  edition = "First",
  year = "1991",
  author = "Kenneth A. Kaufman and Ryszard S. Michalski and Larry Kerschberg",
  title = "Mining for Knowledge in Databases: Goals and General Description of the {INLEN} system",
}

@InProceedings{kaufman.ea:method-reasoning:96,
  title = "A Method for Reasoning with Structured and Continuous Attributes in the {INLEN}-2 Multistrategy Knowledge Discovery System",
  pages = "232",
  author = "Kenneth A. Kaufman and Ryszard S.
Michalski",
  crossref = "simoudis.ea:proceedings-second:96",
}

% "Mc Kearney" with a space is parsed as given name Mc plus surname Kearney;
% written as one surname. NOTE(review): confirm the author's preferred spelling.
@InProceedings{kearney.ea:reverse-engineering:96,
  title = "Reverse Engineering Databases for Knowledge Discovery",
  pages = "375",
  author = "Stephen McKearney and Huw Roberts",
  crossref = "simoudis.ea:proceedings-second:96",
}

@TechReport{keim.ea:supporting-large:93,
  author = "D. A. Keim and H.-P. Kriegel and T. Seidl",
  title = "Supporting Data Mining of Large Databases by Visual Feedback Queries",
  address = "Muenchen",
  year = "1993",
  descriptor = "Anfrage-Bearbeitung, Benutzerschnittstelle, Datenbank, Feedback, Visualisierungskomponente",
}

@InProceedings{keim.ea:supporting-large:94,
  author = "D. A. Keim and H.-P. Kriegel and T. Seidl",
  title = "Supporting Data Mining of Large Databases by Visual Feedback Queries",
  pages = "302--313",
  editor = "Ahmed K. Elmagarmid and Erich Neuhold",
  booktitle = "Proceedings of the 10th International Conference on Data Engineering",
  address = "Houston, TX",
  month = feb,
  year = "1994",
  URL = "http://www.dbs.informatik.uni-muenchen.de/dbs/projekt/papers/datamining.ps",
  publisher = "IEEE Computer Society Press",
}

% Journal name written out in the same form used elsewhere in this file;
% issue number moved to the number field.
@Article{keim.ea:techniques-large:96,
  author = "D. A. Keim and H. P. Kriegel",
  address = "Univ Munich, Inst Comp Sci, Oettingenstr 67, D-80538 Munich, Germany",
  title = "Visualization techniques for mining large databases: a comparison",
  journal = "IEEE Transactions on Knowledge and Data Engineering",
  year = "1996",
  month = dec,
  volume = "8",
  number = "6",
  pages = "923--938",
  abstract = "Visual data mining techniques have proven to be of high value in exploratory data analysis, and they also have a high potential for mining large databases. In this article, we describe and evaluate a new visualization-based approach to mining large databases. The basic idea of our visual data mining techniques is to represent as many data items as possible on the screen at the same time by mapping each data value to a pixel of the screen and arranging the pixels adequately.
The major goal of this article is to evaluate our visual data mining techniques and to compare them to other well-known visualization techniques for multidimensional data: the parallel coordinate and stick figure visualization techniques. For the evaluation of visual data mining techniques, in the first place the perception of properties of the data counts, and only in the second place the CPU time and the number of secondary storage accesses are important. In addition to testing the visualization techniques using real data, we developed a testing environment for database visualizations similar to the benchmark approach used for comparing the performance of database systems. The testing environment allows the generation of test data sets with predefined data characteristics which are important for comparing the perceptual abilities of visual data mining techniques.",
  keywords = "SPACE, data mining, explorative data analysis, visualizing large databases, visualizing multidimensional, multivariate data",
}

@Article{keim.ea:using-to:94,
  author = "D. A. Keim and H.-P. Kriegel",
  title = "Using Visualization to Support Data Mining of Large Existing Databases",
  journal = "Lecture Notes in Computer Science",
  volume = "871",
  pages = "210--??",
  year = "1994",
  ISSN = "0302-9743",
}

@Article{keim.ea:visdb-database:94,
  author = "D. A. Keim and H. Kriegel",
  title = "Vis{DB}: Database Exploration using Multidimensional Visualization",
  journal = "Computer Graphics and Applications",
  year = "1994",
  URL = "http://www.dbs.informatik.uni-muenchen.de/dbs/projekt/papers/visdb.ps",
}

@InProceedings{keim:databases-and:96,
  author = "D. A. Keim",
  title = "Databases and Visualization",
  note = "Tutorial",
  booktitle = "Proc. ACM SIGMOD Int. Conf.
on Management of Data",
  address = "Montreal, Canada",
  year = "1996",
  URL = "http://www.dbs.informatik.uni-muenchen.de/~daniel/Sigmod96TutorialNotes.ps",
  annote = "Comprehensive tutorial on Database visualisation for exploratory analysis",
}

@Article{keim:pixel-oriented-database:96,
  author = "D. A. Keim",
  title = "Pixel-Oriented Database Visualizations",
  journal = "SIGMOD Record (ACM Special Interest Group on Management of Data)",
  volume = "25",
  number = "4",
  pages = "35--39",
  month = dec,
  year = "1996",
}

% The month name had been stored in the number field; moved to month.
@Article{keim:pixel-oriented-techniques:96,
  author = "D. A. Keim",
  title = "Pixel-oriented Visualization Techniques for Exploring Very Large Databases",
  journal = "Journal of Computational and Graphical Statistics",
  month = mar,
  year = "1996",
  URL = "http://www.dbs.informatik.uni-muenchen.de/dbs/projekt/papers/StatisticsPaper.ps",
}

@InProceedings{keogh.ea:probabilistic-approach:97,
  title = "A Probabilistic Approach to Fast Pattern Matching in Time Series Databases",
  author = "Eamonn Keogh and Padhraic Smyth",
  pages = "24",
  crossref = "heckerman.ea:proceedings-third:97",
}

% Re-typed from InCollection to TechReport: the entry never had a booktitle and
% is CWI report CS-R9521 (see its URL). The ISSN that was stored in the address
% field now lives in the ISSN field; the former publisher is the institution.
@TechReport{kersten.ea:on-symbiosis:95,
  author = "Martin L. Kersten and Marcel Holsheimer",
  title = "On the symbiosis of a data mining environment and a {DBMS}",
  pages = "12",
  institution = "Centrum voor Wiskunde en Informatica (CWI)",
  type = "Report",
  number = "CS-R9521",
  ISSN = "0169-118X",
  month = mar # " 30",
  year = "1995",
  keywords = "data mining, parallel databases, knowledge discovery in databases.",
  URL = "ftp://ftp.cwi.nl/pub/CWIreports/AA/CS-R9521.ps.Z",
  abstract = "One of the main obstacles in applying data mining techniques to large, real-world databases is the lack of efficient data management. In this paper, we outline a two-level architecture, consisting of a mining tool and a database server. Key elements in its success are a clear separation of concerns: the mining tool organizes and controls the search process, while all data-handling is performed by the parallel main memory DBMS.
Data is stored as a set of binary tables. The interaction consists of queries for statistical information. Properties of the DBMS and the search algorithm are exploited for optimization of the data handling. In particular, results of previous computations are re-used, and I/O activity is reduced by keeping a small hot-set of binary tables in main-memory. As test results show, this system handles large datasets at a competitive performance.",
  note = "AA (Department of Algorithmics and Architecture)",
  annote = "Originally contained the fields and values - note,CS-R9521 booktitle,92",
}

@InProceedings{ketterlin:clustering-sequences:97,
  title = "Clustering Sequences of Complex Objects",
  author = "A. Ketterlin",
  pages = "215",
  crossref = "heckerman.ea:proceedings-third:97",
}

@Article{khaw.ea:privacy-response:95,
  author = "Y. T. Khaw and H. Y. Lee",
  address = "Natl Comp Board, Inst Informat Technol, Ncb Bldg, 71 Sci Pk Dr, Singapore 0511, Singapore",
  title = "Privacy and knowledge discovery - a response",
  journal = "IEEE Expert-Intelligent Systems \& Their Applications",
  year = "1995",
  volume = "10",
  number = "2",
  pages = "58",
}

@TechReport{kivinen.ea:learning-rules:93,
  author = "Jyrki Kivinen and Heikki Mannila and Esko Ukkonen",
  title = "Learning rules with local exceptions",
  year = "1993",
  institution = "University of Helsinki",
}

@InProceedings{klemettinen.ea:finding-interesting:94,
  author = "Mika Klemettinen and Heikki Mannila and Pirjo Ronkainen and Hannu Toivonen and A. Inkeri Verkamo",
  booktitle = "Third International Conference on Information and Knowledge Management (CIKM'94)",
  title = "Finding interesting rules from large sets of discovered association rules",
  year = "1994",
  URL = "ftp://ftp.cs.helsinki.fi/pub/Reports/by_Project/PMDM/Finding_Interesting_Rules_from_Large_Sets_of_Discovered_Association_Rules.ps.gz",
  editor = "Nabil R. Adam and Bharat K.
Bhargava and Yelena Yesha",
  keywords = "Knowledge discovery, Data mining, Association rules, Rule selection, Visualization",
  month = nov,
  pages = "401--407",
  publisher = "ACM Press",
  abstract = "Association rules, introduced by Agrawal, Imielinski, and Swami, are rules of the form ``for 90 \% of the rows of the relation, if the row has value 1 in the columns in set $W$, then it has 1 also in column $B$''. Efficient methods exist for discovering association rules from large collections of data. The number of discovered rules can, however, be so large that browsing the rule set and finding interesting rules from it can be quite difficult for the user. We show how a simple formalism of {\em rule templates} makes it possible to easily describe the structure of interesting rules. We also give examples of visualization of rules, and show how a visualization tool interfaces with rule templates.",
}

@Article{kloesgen:knowledge-discovery:96,
  author = "W. Kl{\"o}sgen",
  title = "Knowledge discovery in databases and data mining",
  journal = "Lecture Notes in Computer Science",
  volume = "1079",
  pages = "623--??",
  year = "1996",
  ISSN = "0302-9743",
}

@TechReport{klosgen:efficient-interesting:93,
  author = "Willi Kl{\"o}sgen",
  title = "Efficient Discovery of Interesting statements in Databases",
  institution = "GMD",
  year = "1993",
}

@Article{klosgen:problems-their:92,
  crossref = "ijis-special-issue:92",
  author = "W. Kl{\"o}sgen",
  address = "German Natl Res Ctr Comp Sci, St Augustin 1, Germany",
  title = "Problems for knowledge discovery in databases and their treatment in the {Statistics} {Interpreter} {Explora}",
  journal = "International Journal of Intelligent Systems",
  year = "1992",
  volume = "7",
  number = "7",
  pages = "649--673",
  abstract = "In this article we describe some goals and problems of KDD. Approaches are presented which have been implemented in the Statistics Interpreter Explora, a prototype assistant system for discovering interesting findings in recurrent datasets.
We introduce patterns to identify what is interesting in data and give some examples of patterns for difference-, change-, and trend-detection. Then we summarize what must be specified to define a pattern. Besides some descriptive parts, this includes a procedural verification method. Object-oriented programming techniques can simplify the specializations of general patterns. We identify search as a constituent principle of discovery and introduce object structures as a basis to induce a graph structure on the search space. We mention several strategies for graph search and describe approaches for dealing with the aggregation, redundancy, and overlapping problems. Then we address the presentation of findings in natural language and graphical form, focusing on the methods to design good graphical presentations by knowledge-based techniques. Finally, we discuss the paradigm of an adaptive discovery assistant, including the problem of how to reuse the discovered knowledge for further discovery.",
}

@Article{knight:what-makes:96,
  author = "K. Knight",
  address = "Univ So Calif, Inst Sci Informat, 4676 Admiralty Way, Marina Del Rey, Ca, 90292 Univ Massachusetts, Dept Comp Sci, Amherst, Ma, 01003 Inst Study Learning \& Expertise, Palo Alto, Ca, 94306 Stanford Univ, Stanford, Ca, 94305",
  title = "What makes a compelling empirical-evaluation",
  journal = "IEEE Expert-Intelligent Systems \& Their Applications",
  year = "1996",
  volume = "11",
  number = "5",
  pages = "10--14",
}

@InProceedings{knobbe.ea:analysing-binary:96,
  title = "Analysing Binary Associations",
  pages = "311",
  author = "Arno J. Knobbe and Pieter W. Adriaans",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{knorr.ea:extraction-spatial:96,
  title = "Extraction of Spatial Proximity Patterns by Concept Generalization",
  pages = "347",
  author = "Edwin M. Knorr and Raymond T. Ng",
  crossref = "simoudis.ea:proceedings-second:96",
}

@Article{knorr.ea:finding-aggregate:96,
  author = "E. M. Knorr and R. T.
Ng",
  address = "Univ British Columbia, Dept Comp Sci, 2366 Main Mall, Vancouver, Bc V6T 1Z4, Canada",
  title = "Finding aggregate proximity relationships and commonalities in spatial data mining",
  journal = "IEEE Transactions on Knowledge and Data Engineering",
  year = "1996",
  month = dec,
  volume = "8",
  number = "6",
  pages = "884--897",
  abstract = "In this paper, we study two spatial knowledge discovery problems involving proximity relationships between clusters and features. The first problem is: Given a cluster of points, how can we efficiently find features (represented as polygons) that are closest to the majority of points in the cluster? We measure proximity in an aggregate sense due to the nonuniform distribution of points in a cluster (e.g., houses on a map), and the different shapes and sizes of features (e.g., natural or man-made geographic features). The second problem is: Given n clusters of points, how can we extract the aggregate proximity commonalities (i.e., features) that apply to most, if not all, of the n clusters? Regarding the first problem, the main contribution of the paper is the development of Algorithm CRH which uses geometric approximations (i.e., circles, rectangles, and convex hulls) to filter and select features. Highly scalable and incremental, Algorithm CRH can examine over 50,000 features and their spatial relationships with a given cluster in approximately one second of CPU time. Regarding the second problem, the key contribution is the development of Algorithm GenCom that makes use of concept generalization to effectively derive many meaningful commonalities that cannot be found otherwise.",
  keywords = "spatial knowledge discovery, concept generalization, proximity relationships, geometric filtering, GIS",
}

@InProceedings{knorr.ea:unified-notion:97,
  title = "A Unified Notion of Outliers: Properties and Computation",
  author = "Edwin M. Knorr and Raymond T.
Ng",
  pages = "219",
  crossref = "heckerman.ea:proceedings-third:97",
}

@Book{kodratoff.ea:machine-learning:90,
  editor = "Yves Kodratoff and Ryszard S. Michalski",
  title = "Machine Learning, an {Artificial Intelligence} approach",
  publisher = "Morgan Kaufmann",
  year = "1990",
  volume = "3",
  address = "San Mateo, California",
}

@InProceedings{kohavi.ea:automatic-parameter:95,
  author = "Ron Kohavi and George John",
  title = "Automatic Parameter Selection by Minimizing Estimated Error",
  booktitle = "Machine Learning: Proceedings of the Twelfth International Conference",
  month = jul,
  publisher = "Morgan Kaufmann",
  editor = "Armand Prieditis and Stuart Russell",
  pages = "304--312",
  URL = "http://robotics.stanford.edu/users/ronnyk",
  year = "1995",
  contributedby = "Ronny Kohavi, ronnyk(at)sgi.com",
}

@InProceedings{kohavi.ea:bias-plus:96,
  author = "Ron Kohavi and David H. Wolpert",
  title = "Bias Plus Variance Decomposition for Zero-One Loss Functions",
  booktitle = "Machine Learning: Proceedings of the Thirteenth International Conference",
  year = "1996",
  publisher = "Morgan Kaufmann",
  editor = "Lorenza Saitta",
  pages = "275--283",
  URL = "http://robotics.stanford.edu/users/ronnyk",
  month = jul,
  contributedby = "Ronny Kohavi, ronnyk(at)sgi.com",
}

@InProceedings{kohavi.ea:error-based-entropy-based:96,
  author = "Ron Kohavi and Mehran Sahami",
  title = "Error-Based and Entropy-Based Discretization of Continuous Features",
  booktitle = "Proceedings of the Second International Conference on Knowledge Discovery and Data Mining",
  pages = "114--119",
  URL = "http://robotics.stanford.edu/users/ronnyk",
  url2 = "ftp://starry.stanford.edu/pub/ronnyk/disc2.ps",
  year = "1996",
  contributedby = "Ronny Kohavi, ronnyk(at)sgi.com",
  affiliation = "Silicon Graphics Inc.; Stanford University",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{kohavi.ea:feature-subset:95,
  author = "Ron Kohavi and Dan Sommerfield",
  booktitle = "First International Conference on Knowledge Discovery and Data
Mining (KDD-95)",
  title = "Feature Subset Selection Using the Wrapper Method: Overfitting and Dynamic Search Space Topology",
  editor = "Usama M Fayyad and Ramasamy Uthurusamy",
  month = aug,
  year = "1995",
  keywords = "feature subset selection relevant/irrelevant features accuracy estimation, cross-validation",
  URL = "ftp://starry.stanford.edu/pub/ronnyk/fssWrapper.ps",
}

@InProceedings{kohavi.ea:option-decision:97,
  author = "Ron Kohavi and Clayton Kunz",
  title = "Option Decision Trees with Majority Votes",
  booktitle = "Machine Learning: Proceedings of the Fourteenth International Conference",
  editor = "Doug Fisher",
  publisher = "Morgan Kaufmann Publishers, Inc.",
  month = jul,
  year = "1997",
  URL = "http://robotics.stanford.edu/users/ronnyk",
  contributedby = "Ronny Kohavi, ronnyk(at)sgi.com",
}

@InProceedings{kohavi.ea:using-mlc:96,
  author = "Ron Kohavi and Dan Sommerfield and James Dougherty",
  title = "Data Mining Using {MLC}++: {A} Machine Learning Library in {C}++",
  booktitle = "Tools with Artificial Intelligence",
  publisher = "IEEE Computer Society Press",
  pages = "234--245",
  year = "1996",
  note = "Received the best paper award",
  URL = "ftp://starry.stanford.edu/pub/ronnyk/mlc96.ps.Z",
  contributedby = "Ronny Kohavi, ronnyk(at)sgi.com",
}

@Article{kohavi.ea:wrappers-feature:,
  author = "Ron Kohavi and George H.
John",
  title = "Wrappers for Feature Subset Selection",
  journal = "Artificial Intelligence",
  volume = "97",
  number = "1--2",
  pages = "273--324",
  year = "1997",
  URL = "http://robotics.stanford.edu/users/ronnyk",
  contributedby = "Ronny Kohavi, ronnyk(at)sgi.com",
}

@InProceedings{kohavi:scaling-up:96,
  author = "Ron Kohavi",
  title = "Scaling Up the Accuracy of {N}aive-{B}ayes Classifiers: a Decision-Tree Hybrid",
  booktitle = "Proceedings of the Second International Conference on Knowledge Discovery and Data Mining",
  url2 = "http://robotics.stanford.edu/users/ronnyk",
  URL = "ftp://starry.stanford.edu/pub/ronnyk/nbtree.ps",
  pages = "202--207",
  year = "1996",
  contributedby = "Ronny Kohavi, ronnyk(at)sgi.com",
  crossref = "simoudis.ea:proceedings-second:96",
}

@PhdThesis{kohavi:wrappers-performance:95,
  author = "Ron Kohavi",
  title = "Wrappers for Performance Enhancement and Oblivious Decision Graphs",
  school = "Department of Computer Science, Stanford University",
  year = "1995",
  note = "STAN-CS-TR-95-1560",
  URL = "ftp://starry.stanford.edu/pub/ronnyk",
  contributedby = "Ronny Kohavi, ronnyk(at)sgi.com",
}

@InProceedings{kontkanen.ea:predictive-with:96,
  title = "Predictive Data Mining with Finite Mixtures",
  pages = "176",
  author = "Petri Kontkanen and Petri Myllymaki and Henry Tirri",
  crossref = "simoudis.ea:proceedings-second:96",
}

@Article{koperski.ea:spatial-association:95,
  author = "K. Koperski and J. W. Han",
  address = "Simon Fraser Univ, Sch Comp Sci, Burnaby, Bc V5A 1S6, Canada",
  title = "Discovery of spatial association rules in geographic information databases",
  journal = "Lecture Notes In Computer Science",
  year = "1995",
  volume = "951",
  pages = "47--66",
  abstract = "Spatial data mining, i.e., discovery of interesting, implicit knowledge in spatial databases, is an important task for understanding and use of spatial data- and knowledge- bases. In this paper, an efficient method for mining strong spatial association rules in geographic information databases is proposed and studied.
A spatial association rule is a rule indicating certain association relationship among a set of spatial and possibly some nonspatial predicates. A strong rule indicates that the patterns in the rule have relatively frequent occurrences in the database and strong implication relationships. Several optimization techniques are explored, including a two-step spatial computation technique (approximate computation on large sets, and refined computations on small promising patterns), shared processing in the derivation of large predicates at multiple concept levels, etc. Our analysis shows that interesting association rules can be discovered efficiently in large spatial databases.",
}

@TechReport{korn.ea:quantifiable-using:97,
  author = "Flip Korn and Alexandros Labrinidis and Yannis Kotidis and Christos Faloutsos and Alex Kaplunovich and Dejan Perkovic",
  title = "Quantifiable Data Mining Using Principal Component Analysis",
  institution = "University of Maryland Institute for Advanced Computer Studies Dept. of Computer Science, Univ. of Maryland",
  address = "College Park, MD",
  number = "CS-TR-3754",
  year = "1997",
  month = feb,
  URL = "ftp://ftp.cs.umd.edu/pub/papers/papers/3754/3754.ps.Z",
  abstract = "Association Rule Mining algorithms operate on a data matrix (e.g., customers x products) to derive rules. We propose a single-pass algorithm for mining linear rules in such a matrix based on Principal Component Analysis. PCA detects correlated columns of the matrix, which correspond to, e.g., products that sell together.\par The first contribution of this work is that we propose to quantify the ``goodness'' of a set of discovered rules. We define the ``guessing error'': the root-mean-square error of the reconstructed values of the cells of the given matrix, when we pretend that they are unknown. The second contribution is a novel method to guess missing/hidden values from the linear rules that our method derives.
For example, if somebody bought \$10 of milk and \$3 of bread, our rules can ``guess'' the amount spent on, say, butter. Thus, we can perform a variety of important tasks such as forecasting, `what-if' scenarios, outlier detection, and visualization. Moreover, we show that we can compute the principal components with a single pass over the dataset.\par Experiments on real datasets (e.g., NBA statistics) demonstrate that the proposed method consistently achieves a ``guessing error'' of up to 5 times lower than the straightforward competitor.\par (Also cross-referenced as UMIACS-TR-97-13)",
}

@InProceedings{kramer.ea:causes-cancer:97,
  title = "Mining for Causes of Cancer: Machine Learning Experiments at Various Levels of Detail",
  author = "Stefan Kramer and Bernhard Pfahringer and Christoph Helma",
  pages = "223",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InProceedings{kramer.ea:efficient-search:96,
  title = "Efficient Search for Strong Partial Determinations",
  pages = "371",
  author = "Stefan Kramer and Bernhard Pfahringer",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{kranakis.ea:complexity-on:96,
  author = "Evangelos Kranakis and Danny Krizanc and Andrzej Pelc and David Peleg",
  title = "The Complexity of Data Mining on the Web",
  pages = "153",
  booktitle = "Proceedings of the 15th Annual {ACM} Symposium on Principles of Distributed Computing",
  month = may,
  publisher = "ACM",
  address = "New York",
  year = "1996",
}

@Article{krivda:data-mining-dynamite:95,
  author = "Cheryl D.
Krivda",
  title = "Data-Mining Dynamite --- Supercharge your data-mining projects with data cleansing, data warehouses, parallel processing, and mega-storage",
  journal = "Byte Magazine",
  volume = "20",
  number = "10",
  pages = "97--??",
  month = oct,
  year = "1995",
  ISSN = "0360-5280",
}

@Article{krivda:unearthing-underground:96,
  author = "Cheryl D. Krivda",
  title = "Unearthing Underground Data",
  journal = "LAN Magazine",
  year = "1996",
  note = "May 20 - June 2",
  URL = "http://www.lanmag.com/9605mine.htm",
}

@InProceedings{laer.ea:multi-class-problems:96,
  author = "Van Laer, W. and D{\v{z}}eroski, S. and De Raedt, L.",
  title = "Multi-class problems and discretization in {ICL}",
  booktitle = "Proceedings of the MLnet Familiarization Workshop on Data Mining with Inductive Logic Programming",
  pages = "53--60",
  year = "1996",
}

@InProceedings{lagus.ea:self-organizing-maps:96,
  title = "Self-Organizing Maps of Document Collections: {A} New Approach to Interactive Exploration",
  pages = "238",
  author = "Krista Lagus and Timo Honkela and Samuel Kaski and Teuvo Kohonen",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{laird:discrete-sequence:92,
  author = "P. Laird",
  title = "Discrete Sequence Prediction and Its Applications",
  year = "1992",
  booktitle = "Proc.\ of AAAI-92",
  pages = "135--140",
}

@InProceedings{lakshminarayan.ea:imputation-missing:96,
  title = "Imputation of Missing Data Using Machine Learning Techniques",
  pages = "140",
  author = "Kamakshi Lakshminarayan and Steven A. Harp and Robert Goldman and Tariq Samad",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{lange:empirical-test:96,
  title = "An Empirical Test of the Weighted Effect Approach to Generalized Prediction Using Recursive Neural Nets",
  pages = "183",
  author = "Rense Lange",
  crossref = "simoudis.ea:proceedings-second:96",
}

@Article{langley.ea:data-driven-approaches:89,
  author = "Pat Langley and Jan M.
Zytkow",
  title = "Data-Driven Approaches to Empirical Discovery",
  journal = "Artificial Intelligence",
  pages = "283--312",
  volume = "40",
  month = sep,
  year = "1989",
}

@InCollection{langley.ea:rediscovering-chemistry:86,
  author = "Pat Langley and Gary L. Bradshaw and Herbert A. Simon",
  title = "Rediscovering chemistry with the {Bacon} system",
  crossref = "michalski.ea:machine-learning:86",
  pages = "307--329",
}

@InCollection{langley.ea:search-regularity:86,
  author = "Pat Langley and Jan M. Zytkow and Herbert A. Simon and Gary L. Bradshaw",
  title = "The Search for Regularity: Four Aspects of Scientific Discovery",
  crossref = "michalski.ea:machine-learning:86",
  pages = "425--469",
}

@InProceedings{langley:induction-condensed:96,
  title = "Induction of Condensed Determinations",
  pages = "327",
  author = "Pat Langley",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{lee.ea:context-sensitive-discretization:94,
  author = "Changhwan Lee and Dong-Guk Shin",
  title = "A Context-Sensitive Discretization of Numeric Attributes for Classification Learning",
  booktitle = "ECAI 94. Proceeding of the 11th European Conference on Artificial Intelligence",
  publisher = "John Wiley and Sons, Ltd",
  year = "1994",
  pages = "428--432",
}

@Article{lee.ea:database-summarization:97,
  author = "D. H. Lee and M. H. Kim",
  address = "Chonnam Natl Univ, Dept Comp Sci, Kwangju, South Korea Korea Adv Inst Sci \& Technol, Dept Comp Sci, Taejon 305701, South Korea",
  title = "Database summarization using fuzzy isa hierarchies",
  journal = "IEEE Transactions on Systems, Man, and Cybernetics, Part B: Cybernetics",
  year = "1997",
  volume = "27",
  number = "1",
  pages = "68--78",
  abstract = "Summary discovery is one of the major components of knowledge discovery in databases, which provides the user with comprehensive information for grasping the essence from a large amount of information in a database.
In this paper, we propose an interactive top-down summary discovery process which utilizes fuzzy ISA hierarchies as domain knowledge. We define a generalized tuple as a representational form of a database summary including fuzzy concepts. By virtue of fuzzy ISA hierarchies where fuzzy ISA relationships common in actual domains are naturally expressed, the discovery process comes up with more accurate database summaries. We also present an informativeness measure for distinguishing generalized tuples that delivers much information to users, based on Shannon's information theory.",
  keywords = "data mining, fuzzy set application, summary discovery",
}

@InProceedings{lee.ea:hypothesis-refinement:93,
  author = "Do Heon Lee and Myoung Ho Kim",
  title = "A Hypothesis Refinement Method for Summary Discovery in Databases",
  pages = "274--282",
  editor = "Bharat Bhargava and Timothy Finin and Yelena Yesha",
  booktitle = "Proceedings of the 2nd International Conference on Information and Knowledge Management",
  month = nov,
  publisher = "ACM Press",
  address = "New York, NY, USA",
  year = "1993",
}

@Article{lee.ea:support:96,
  author = "H. Y. Lee and H. L. Ong",
  address = "Inst Informat Technol, Japan Singapore Artificial Intelligence Ctr, 11 Sci Pk Rd, Singapore 117685, Singapore",
  title = "Visualization support for data mining",
  journal = "IEEE Expert-Intelligent Systems \& Their Applications",
  year = "1996",
  volume = "11",
  number = "5",
  pages = "69--75",
  annote = "Discusses WinViz System",
  URL = "http://jsaic.iti.gov.sg/pubs/papers/papers_archive/IEEEpub.zip",
  keywords = "visualisation, WinViz, parallel co-ordinates",
}

@Article{lenat:eurisko-program:83,
  author = "D. Lenat",
  title = "{EURISKO}: {A} program that learns new heuristics and domain concepts.
{T}he nature of heuristics {III}: Background and examples",
  journal = "Artificial Intelligence",
  year = "1983",
  pages = "61--98",
  volume = "21",
}

@InProceedings{lent.ea:discovering-trends:97,
  title = "Discovering Trends in Text Databases",
  author = "Brian Lent and Rakesh Agrawal and Ramakrishnan Srikant",
  pages = "227",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InProceedings{lim.ea:framework-integrating:92,
  author = "J. H. Lim and H. C. Lui and P. Z. Wang",
  title = "A Framework for Integrating Fault Diagnosis and Incremental Knowledge Acquisition in Connectionist Expert Systems",
  year = "1992",
  booktitle = "Proceedings of AAAI-92",
  pages = "159--164",
}

@Article{limb.ea:tools-techniques:94,
  author = "P. R. Limb and G. J. Meggs",
  address = "British Telecommun Labs, Martlesham Heath, Ipswich 1P5 7Re, Suffolk, England",
  title = "Data mining - tools and techniques",
  journal = "BT Technology Journal",
  year = "1994",
  volume = "12",
  number = "4",
  pages = "32--41",
  abstract = "The amount of data collected by large telecommunications companies like BT is vast. In order to turn this voluminous data into valuable information it is necessary to apply analysis techniques to build models and characteristics of data. This paper gives an overview of a range of techniques used for data analysis collectively known as data mining. Three broad categories of data mining techniques are suggested and the reader is introduced to popular algorithms within each category. References to additional algorithms are also presented so that the reader may gain more detailed information if required.",
}

@Article{liu.ea:dimensionality-reduction:96,
  author = "H. Liu and R.
Setiono",
  address = "Natl Univ Singapore, Dept Informat Syst \& Comp Sci, Singapore 0511, Singapore",
  title = "Dimensionality reduction via discretization",
  journal = "Knowledge-Based Systems",
  year = "1996",
  volume = "9",
  number = "1",
  pages = "67--72",
  abstract = "The existence of numeric data and large numbers of records in a database present a challenging task in terms of explicit concepts extraction from the raw data. The paper introduces a method that reduces data vertically and horizontally, keeps the discriminating power of the original data, and paves the way for extracting concepts. The method is based on discretization (vertical reduction) and feature selection (horizontal reduction). The experimental results show that (a) the data can be effectively reduced by the proposed method; (b) the predictive accuracy of a classifier (C4.5) can be improved after data and dimensionality reduction; and (c) the classification rules learned are simpler.",
  keywords = "DIMENSIONALITY REDUCTION, DISCRETIZATION, KNOWLEDGE DISCOVERY",
}

@InProceedings{liu.ea:using-general:97,
  title = "Using General Impressions to Analyze Discovered Classification Rules",
  author = "Bing Liu and Wynne Hsu and Shu Chen",
  pages = "31",
  crossref = "heckerman.ea:proceedings-third:97",
}

@Article{long.ea:expert-systems:92,
  author = "J. M. Long and J. R. Slagle",
  address = "Univ Minnesota, Dept Surg, Box 290 Umhc, 420 Delaware St Se, Minneapolis, Mn, 55455",
  title = "Expert systems, clinical-data analyses, and knowledge discovery - the {POSCH} {AI} project",
  journal = "Annals of the New York Academy of Sciences",
  year = "1992",
  volume = "670",
  pages = "146--154",
}

@InProceedings{lorenzo:application-clausal:96,
  author = "D. Lorenzo",
  title = "Application of Clausal Discovery to Temporal Databases",
  booktitle = "Proceedings of the MLnet Familiarization Workshop on Data Mining with Inductive Logic Programming",
  pages = "25--40",
  year = "1996",
}

@Article{lu.ea:effective-using:96,
  author = "H. J.
Lu and R. Setiono and H. Liu",
  address = "Natl Univ Singapore, Dept Informat Syst \& Comp Sci, Lower Kent Ridge Rd, Singapore 119260, Singapore",
  title = "Effective data mining using neural networks",
  journal = "IEEE Transactions on Knowledge and Data Engineering",
  year = "1996",
  volume = "8",
  number = "6",
  pages = "957--961",
  abstract = "Classification is one of the data mining problems receiving great attention recently in the database community. This paper presents an approach to discover symbolic classification rules using neural networks. Neural networks have not been thought suited for data mining because how the classifications were made is not explicitly stated as symbolic rules that are suitable for verification or interpretation by humans. With the proposed approach, concise symbolic rules with high accuracy can be extracted from a neural network. The network is first trained to achieve the required accuracy rate. Redundant connections of the network are then removed by a network pruning algorithm. The activation values of the hidden units in the network are analyzed, and classification rules are generated using the result of this analysis. The effectiveness of the proposed approach is clearly demonstrated by the experimental results on a set of standard data mining test problems.",
  keywords = "data mining, neural networks, rule extraction, network pruning, classification",
}

@InProceedings{lu.ea:neurorule-connectionist:95,
  author = "H. Lu and R. Setiono and H. Liu",
  title = "{N}euro{R}ule: {A} Connectionist Approach to Data Mining",
  booktitle = "Proceedings of the 21st International Conference on Very Large Data Bases ({VLDB}'95)",
  year = "1995",
  URL = "http://www.iscs.nus.sg/~liuh/vldb95.ps",
  keywords = "Neural Networks, Data Mining, Classification Rules",
}

@Article{major.ea:efd-hybrid:92,
  author = "J. A. Major and D. R.
Riedinger",
  address = "Travelers Insurance Co, Hartford, Ct, 06183",
  title = "{EFD} - a hybrid knowledge statistical-based system for the detection of fraud",
  crossref = "ijis-special-issue:92",
  journal = "International Journal of Intelligent Systems",
  year = "1992",
  volume = "7",
  number = "7",
  pages = "687--703",
  abstract = "EFD (Electronic Fraud Detection) assists Investigative Consultants in the Managed Care \& Employee Benefits Security Unit of The Travelers Insurance Companies in the detection and preinvestigative analysis of healthcare provider fraud. The task EFD performs, scanning a large population of health insurance claims in search of likely fraud, has never been done manually. Furthermore, the available database has few positive examples. Thus, neither existing knowledge engineering techniques nor statistical methods are sufficient for designing the identification process. To overcome these problems, EFD uses knowledge discovery techniques on two levels. First, EFD integrates expert knowledge with statistical information assessment to identify cases of unusual provider behavior. The heart of EFD is 27 behavioral heuristics, knowledge-based ways of viewing and measuring provider behavior. Rules operate on them to identify providers whose behavior merits a closer look by the Investigative Consultants. Second, machine learning is used to develop new rules and improve the identification process. Pilot operations involved analysis of nearly 22 000 providers in six metropolitan areas.
The pilot is implemented in SAS Institute's SAS(R) System, AICorp's Knowledge Base Management System (KBMS(R)), and Borland International's Turbo Prolog(R).",
  keywords = "Statistics, Frontiers, Finance, natural language reports",
}

@Misc{mallen:cupid-iterative:,
  author = "Jason Mallen and Max Bramer",
  title = "{CUPID} - An Iterative Knowledge Discovery Framework",
  note = "Presented at ES94 (12/10/94)",
  year = "1994",
  address = "University of Portsmouth, UK",
  URL = "http://osiris.sis.port.ac.uk/technical_reports_index/kdpap.html",
  abstract = "This paper describes the novel Knowledge Discovery system CUPID. Knowledge Discovery from Databases (KDD) is concerned with utilising techniques borrowed from fields such as machine learning (ML), statistics and databases to search for relationships and global patterns that may exist in large databases, but are `hidden' among the vast amounts of data. The discovered knowledge can be helpful for building knowledge based systems and data analysis. The underlying principle behind CUPID is the use of a quantitative measure for the `interest' of a hypotheses. This measure provides a method of ranking competing hypotheses and thus allows the system to store the 'best' or 'most interesting' rules describing a database. CUPID is based on the ITRule algorithm of (Smyth \& Goodman, 1992) and extends that algorithm with added functionality. CUPID provides four fundamental features. One, background knowledge in the form of attribute value generalisation hierarchies may be utilised. Two, prior domain knowledge which may be incorrect and incomplete may be provided by a domain expert. Three, knowledge may be re-used. Four, noise in the data set is handled in a well founded manner.",
}

@InCollection{manago.ea:induction-decision:91,
  crossref = "piatetsky-shapiro.ea:knowledge-discovery:91",
  editor = "Gregory Piatetsky-Shapiro and William J.
Frawley",
  booktitle = "Knowledge Discovery in Databases",
  publisher = "AAAI Press / The MIT Press",
  address = "Menlo Park, California",
  edition = "First",
  year = "1991",
  author = "Michel Manago and Yves Kodratoff",
  title = "Induction of Decision trees from Complex Structured Data",
  pages = "289--306",
}

@Article{mannila.ea:algorithms-inferring:94,
  author = "H. Mannila and K.-J. R{\"a}ih{\"a}",
  address = "Univ Helsinki, Dept Comp Sci, Pob 26, Sf-90014 Helsinki, Finland Univ Tampere, Dept Comp Sci, Sf-33101 Tampere, Finland",
  title = "Algorithms for inferring functional-dependencies from relations",
  journal = "Data \& Knowledge Engineering",
  year = "1994",
  volume = "12",
  number = "1",
  pages = "83--99",
  abstract = "The dependency inference problem is to find a cover of the set of functional dependencies that hold in a given relation. The problem has applications in relational database design, in query optimization, and in artificial intelligence. The problem is exponential in the number of attributes. We develop two algorithms with better best case behavior than the simple one. One algorithm reduces the problem to computing the transversal of a hypergraph. The other is based on repeatedly sorting the relation with respect to a set of attributes.",
  keywords = "ARMSTRONG RELATIONS, DESIGN, FUNCTIONAL DEPENDENCIES, MACHINE DISCOVERY, DATA MINING, ALGORITHMS",
}

@InProceedings{mannila.ea:discovering-frequent-episodes-in-sequences:95,
  author = "H. Mannila and H. Toivonen and A. I. Verkamo",
  title = "Discovering Frequent Episodes in Sequences",
  booktitle = "Proceedings of the First International Conference on Knowledge Discovery and Data Mining (KDD-95)",
  year = "1995",
  address = "Montreal, Canada",
  month = aug,
  publisher = "AAAI Press",
  editor = "U. M. Fayyad and R.
Uthurusamy",
  URL = "ftp://ftp.cs.helsinki.fi/pub/Reports/by_Project/PMDM/Finding_Frequent_Episodes_in_Sequences.ps.gz",
  keywords = "Knowledge discovery, Data mining, Sequence analysis, Episode discovery",
  abstract = "Sequences of events describing the behavior and actions of users or systems can be collected in several domains. In this paper we consider the problem of recognizing frequent episodes in such sequences of events. An episode is defined to be a collection of events that occur within time intervals of a given size in a given partial order. Once such episodes are known, one can produce rules for describing or predicting the behavior of the sequence. We describe an efficient algorithm for the discovery of all frequent episodes from a given class of episodes, and present experimental results.",
}

@InProceedings{mannila.ea:discovering-generalized:96,
  title = "Discovering Generalized Episodes Using Minimal Occurrences",
  pages = "146",
  author = "Heikki Mannila and Hannu Toivonen",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{mannila.ea:efficient-algorithms:94,
  author = "Heikki Mannila and Hannu Toivonen and A. Inkeri Verkamo",
  booktitle = "AAAI Workshop on Knowledge Discovery in Databases (KDD-94)",
  title = "Efficient algorithms for discovering association rules",
  year = "1994",
  URL = "ftp://ftp.cs.helsinki.fi/pub/Reports/by_Project/PMDM/Efficient_Algorithms_for_Discovering_Association_Rules.ps.gz",
  editor = "Usama M. Fayyad and Ramasamy Uthurusamy",
  address = "Seattle, Washington",
  publisher = "AAAI Press",
  keywords = "Knowledge discovery, Data mining, Association rules",
  month = jul,
  pages = "181--192",
  abstract = "Association rules are statements of the form ``for 90 \% of the rows of the relation, if the row has value 1 in the columns in set $W$, then it has 1 also in column $B$''.
Agrawal, Imielinski, and Swami introduced the problem of mining association rules from large collections of data, and gave a method based on successive passes over the database. We give an improved algorithm for the problem. The method is based on careful combinatorial analysis of the information obtained in previous passes; this makes it possible to eliminate unnecessary candidate rules. Experiments on a university course enrollment database indicate that the method outperforms the previous one by a factor of 5. We also show that sampling is in general a very efficient way of finding such rules.",
}

@InProceedings{mannila.ea:multiple-uses:96,
  title = "Multiple Uses of Frequent Sets and Condensed Representations: Extended Abstract",
  pages = "189",
  author = "Heikki Mannila and Hannu Toivonen",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{mannila:machine-learning:96,
  author = "Heikki Mannila",
  title = "Data mining and machine learning (abstract)",
  booktitle = "Proc.\ 13th International Conference on Machine Learning",
  publisher = "Morgan Kaufmann",
  year = "1996",
  pages = "555",
}

@Article{mark:here-we:96,
  author = "B. Mark",
  address = "Natl Semicond Architecture Lab, 2900 Semicond Dr, M-S E-100, Santa Clara, Ca, 95052",
  title = "Data mining - here we go again - guest-editors introduction",
  journal = "{IEEE} Expert: Intelligent Systems \& Their Applications",
  year = "1996",
  volume = "11",
  number = "5",
  pages = "18--19",
}

@Article{marks:inference-mls:96,
  author = "D. G. Marks",
  address = "Us Dept Def, Off Infosec Comp Sci, Ft George G Meade, Md, 20755",
  title = "Inference in {MLS} database systems",
  journal = "{IEEE} Transactions on Knowledge and Data Engineering",
  year = "1996",
  volume = "8",
  number = "1",
  pages = "46--55",
  abstract = "Database systems that contain information of varying degrees of sensitivity pose the threat that some of the Low data may infer High data. This study derives conditions sufficient to identify such inference threats.
First, it is reasoned that a database can only control material implications, as specified in formal logic systems. These material implications are found using Knowledge Discovery techniques. Material implications allow reasoning about outside knowledge, and provide the first assurance that outside knowledge does not assist in circumventing the inference controls. Database queries specify the properties of sets of data and are compared to help determine inferences. These queries are grouped into equivalence classes based upon their inference characteristics. A unique graph based model is developed for the equivalence classes that 1) makes such comparisons easy, and 2) allows implementation of an algorithm capable of finding those material implication rules where High data is inferred from Low data. This is the first method that offers assurance and sufficiency arguments that the mechanism is at least strong enough to protect the High data in the database from inference attacks that require Low data.",
  keywords = "INFERENCE, DATABASE SECURITY, KNOWLEDGE DISCOVERY, MLS, QUERY PATTERNS",
}

@InProceedings{masand.ea:comparison-approaches:96,
  title = "A Comparison of Approaches for Maximizing Business Payoff of Prediction Models",
  pages = "195",
  author = "Brij Masand and Gregory Piatetsky-Shapiro",
  crossref = "simoudis.ea:proceedings-second:96",
}

@Article{matheus.ea:systems:93,
  crossref = "cercone.ea:ieee-transactions:93",
  author = "C. J. Matheus and P. K. Chan and G. Piatetsky-Shapiro",
  address = "Gte Labs Inc, Tech Staff, 40 Sylvan Rd, Waltham, Ma, 02254 Gte Labs Inc, Knowledge Discovery Databases Project, Waltham, Ma, 02254 Columbia Univ, Dept Comp Sci, New York, Ny, 10027",
  title = "Systems for knowledge discovery in databases",
  journal = "{IEEE} Trans.
on Knowledge and Data Engineering",
  year = "1993",
  month = dec,
  volume = "5",
  number = "6",
  pages = "903--913",
  abstract = "The automated discovery of knowledge in databases is becoming increasingly important as the world's wealth of data continues to grow exponentially. Knowledge-discovery systems face challenging problems from real-world databases which tend to be dynamic, incomplete, redundant, noisy, sparse, and very large. This paper addresses these problems and describes some techniques for handling them. A model of an idealized knowledge-discovery system is presented as a reference for studying and designing new systems. This model is used in the comparison of three systems: CoverStory, EXPLORA, and the Knowledge Discovery Workbench. The deficiencies of existing systems relative to the model reveal several open problems for future research.",
  annote = "Discusses Coverstory, Explora and KDW",
  keywords = "DATABASES, DISCOVERY, KDD SYSTEMS, MACHINE LEARNING",
}

@Article{mcaleer.ea:con-econometrics:95,
  author = "M. McAleer and M. R.
Veall",
  address = "Univ Western Australia, Dept Econ, Nedlands, Wa 6009, Australia Mcmaster Univ, Dept Econ, Hamilton, On, Canada",
  title = "Data mining and the con in econometrics - the {US} demand for money revisited",
  journal = "Mathematics and Computers in Simulation",
  year = "1995",
  volume = "39",
  number = "3--4",
  pages = "329--333",
}

@InProceedings{mehta.ea:mdl-based-decision:95,
  author = "Manish Mehta and Jorma Rissanen and Rakesh Agrawal",
  booktitle = "Proceedings of the First International Conference on Knowledge Discovery and Data Mining (KDD'95)",
  title = "{MDL}-Based Decision Tree Pruning",
  year = "1995",
  abstract-url = "http://www.almaden.ibm.com/cs/people/ragrawal/abstracts.html#mra95",
  URL = "http://www.almaden.ibm.com/cs/people/ragrawal/papers/kdd95_mdl.ps",
  keywords = "Data Mining, Classification, Decision-Trees, MDL",
  month = aug,
  pages = "216--221",
  abstract = "This paper explores the application of the Minimum Description Length principle for pruning decision trees. We present a new algorithm that intuitively captures the primary goal of reducing the misclassification error. An experimental comparison is presented with three other pruning algorithms. The results show that the MDL pruning algorithm achieves good accuracy, small trees, and fast execution times.",
}

@Article{mehta.ea:sliq-fast:96,
  author = "M. Mehta and R. Agrawal and J. Rissanen",
  title = "{SLIQ}: {A} Fast Scalable Classifier for Data Mining",
  journal = "Lecture Notes in Computer Science",
  volume = "1057",
  pages = "18--32",
  year = "1996",
  ISSN = "0302-9743",
}

@Article{mesrobian.ea:geophysical-data:96,
  author = "E. Mesrobian and R. Muntz and E. Shek and S. Nittel and M. Larouche and M. Kriguer and C. Mechoso and J. Farrara and P. Stolorz and H.
Nakamura",
  address = "Univ Calif Los Angeles, Dept Comp Sci, Los Angeles, Ca, 90024 Univ Calif Los Angeles, Dept Atmospher Sci, Los Angeles, Ca, 90024 Univ Tokyo, Dept Earth \& Planetary Phys, Tokyo, Japan Univ Calif Los Angeles, Data Min Lab, Los Angeles, Ca, 90024",
  title = "Mining geophysical data for knowledge",
  journal = "{IEEE} Expert: Intelligent Systems \& Their Applications",
  year = "1996",
  volume = "11",
  number = "5",
  pages = "34--44",
}

@TechReport{michalski.ea:aq15-inductive:86,
  author = "Ryszard S. Michalski and Igor Mozetic and Jiarong Hong and Nada Lavrac",
  title = "The {AQ15} inductive learning system: an overview and experiments",
  institution = "University of Illinois",
  number = "UIUCDCS-R-86-1260",
  month = jul,
  year = "1986",
}

@Article{michalski.ea:clustering:92,
  crossref = "shapiro:encyclopedia-artificial:92",
  key = "Encyclopedia-of-ai:clustering",
  author = "R. S. Michalski and R. E. Stepp",
  title = "Clustering",
  pages = "168--176",
}

@Article{michalski.ea:inlen-architecture:92,
  author = "R. S. Michalski and L. Kerschberg and K. A. Kaufman",
  title = "Mining for knowledge in Databases: The {INLEN} Architecture, Initial Implementation and First Results",
  journal = "Journal of Intelligent Information Systems",
  year = "1992",
  volume = "1",
  number = "1",
  pages = "85--113",
}

@InCollection{michalski.ea:learning-observation:83,
  author = "Ryszard S. Michalski and Robert E. Stepp",
  title = "Learning from observation: conceptual clustering",
  booktitle = "Machine Learning, an {Artificial Intelligence} approach",
  pages = "331--363",
  crossref = "michalski.ea:machine-learning:83",
}

@Book{michalski.ea:machine-learning:83,
  editor = "Ryszard S. Michalski and Jaime G. Carbonell and Tom M. Mitchell",
  title = "Machine Learning, an {Artificial Intelligence} approach",
  publisher = "Morgan Kaufmann",
  year = "1983",
  volume = "1",
  address = "San Mateo, California",
}

@Book{michalski.ea:machine-learning:86,
  editor = "Ryszard S. Michalski and Jaime G. Carbonell and Tom M.
Mitchell",
  title = "Machine Learning, an {Artificial Intelligence} approach",
  publisher = "Morgan Kaufmann",
  year = "1986",
  volume = "2",
  address = "San Mateo, California",
}

@InProceedings{michalski.ea:multi-purpose-incremental:86,
  author = "Ryszard S. Michalski and Igor Mozetic and Jiarong Hong and Nada Lavrac",
  title = "The multi-purpose incremental learning system {AQ15} and its testing application to three medical domains",
  booktitle = "Proceedings of the 5th national conference on Artificial Intelligence",
  pages = "1041--1045",
  address = "Philadelphia",
  year = "1986",
}

@InCollection{michalski:theory-methodology:83,
  author = "Ryszard S. Michalski",
  title = "A theory and methodology of inductive learning",
  booktitle = "Machine Learning, an {Artificial Intelligence} approach",
  pages = "83--134",
  crossref = "michalski.ea:machine-learning:83",
}

@InProceedings{mihalisin.ea:fast-robust:97,
  title = "Fast Robust Visual Data Mining",
  author = "Ted Mihalisin and John Timlin",
  pages = "231",
  crossref = "heckerman.ea:proceedings-third:97",
}

@Article{mingers:empirical-comparison:89,
  author = "J. Mingers",
  title = "An empirical comparison of selection measures for decision tree induction",
  publisher = "Kluwer Academic",
  address = "Boston",
  journal = "Machine Learning",
  year = "1989",
  volume = "3",
  number = "4",
  pages = "319--342",
}

@InCollection{minsky:framework-representating:75,
  author = "Marvin Minsky",
  title = "A framework for representing knowledge",
  booktitle = "The Psychology of Computer Vision",
  editor = "Patrick Henry Winston",
  publisher = "McGraw-Hill",
  address = "New York",
  pages = "211--277",
  year = "1975",
}

@InCollection{mitchell.ea:learning-by:83,
  author = "Tom M. Mitchell and Paul E. Utgoff and Ranan Banerji",
  title = "Learning by experimentation: acquiring and refining problem-solving heuristics",
  booktitle = "Machine Learning, an {Artificial Intelligence} approach",
  crossref = "michalski.ea:machine-learning:83",
  pages = "163--190",
}

@Article{miyano:learning-theory-toward:95,
  author = "S.
Miyano",
  address = "Kyushu Univ, Fundamental Informat Sci Res Inst, Fukuoka 812, Japan",
  title = "Learning theory toward genome informatics",
  journal = "{IEICE} Transactions on Information and Systems",
  year = "1995",
  volume = "E78-D",
  number = "5",
  pages = "560--567",
  abstract = "This paper discusses some problems in Molecular Biology for which learning paradigms are strongly desired. We also present a framework of knowledge discovery by PAC-learning paradigm together with its theory and practice developed in our work for discovery from amino acid sequences.",
  keywords = "PAC-LEARNING, COMPUTATIONAL COMPLEXITY, KNOWLEDGE ACQUISITION, GENOME INFORMATICS",
}

@Article{mollestad.ea:rough-set:96,
  author = "T. Mollestad and A. Skowron",
  title = "A rough set framework for data mining of propositional default rules",
  journal = "Lecture Notes in Computer Science",
  volume = "1079",
  pages = "448--457",
  year = "1996",
  ISSN = "0302-9743",
}

@InProceedings{monge.ea:field-matching:96,
  title = "The Field Matching Problem: Algorithms and Applications",
  pages = "267",
  author = "Alvaro E. Monge and Charles P. Elkan",
  crossref = "simoudis.ea:proceedings-second:96",
}

@TechReport{mooney:encouraging-experimental:92,
  author = "Raymond J. Mooney",
  title = "Encouraging experimental results on learning {CNF}",
  institution = "University of Texas",
  month = oct,
  year = "1992",
}

@InProceedings{morik.ea:multistrategy-approach:96,
  author = "K. Morik and P.
Brockhausen",
  booktitle = "Proceedings of the 3rd International Workshop on Multistrategy Learning",
  publisher = "AAAI Press",
  title = "A Multistrategy Approach to Relational Knowledge Discovery in Databases",
  pages = "17--28",
  year = "1996",
}

@InProceedings{morik:applications-machine:92,
  author = "Katharina Morik",
  title = "Applications of Machine Learning",
  booktitle = "Proc.\ 6th European Knowledge Acquisition Workshop",
  year = "1992",
  publisher = "Springer-Verlag, Berlin",
  pages = "9--13",
  annote = "Brief overview of ML applications, Future directions of ML - Integration of ML into databases (data mining), Multi-strategy learning, inductive logic programming.",
}

@Article{morrell:using-probabilistic:97,
  author = "J. A. Morrell",
  address = "Monsanto Co, St Louis, Mo, 63198",
  title = "Data mining using probabilistic structure analysis",
  journal = "Abstracts Of Papers Of The American Chemical Soc.",
  year = "1997",
  volume = "213",
  issue = "Pt1",
  pages = "69-CINF",
}

@Proceedings{morrison.ea:advances-14th:96,
  editor = "Ronald Morrison and Jessie Kennedy",
  booktitle = "Advances in databases: 14th British National Conference on Databases, {BNCOD} 14, Edinburgh, Scotland, United Kingdom, July 3--5, 1996: proceedings",
  title = "Advances in databases: 14th British National Conference on Databases, {BNCOD} 14, Edinburgh, Scotland, United Kingdom, July 3--5, 1996: proceedings",
  volume = "1094",
  publisher = "Springer-Verlag Inc.",
  address = "New York, NY, USA",
  pages = "xi + 229",
  year = "1996",
  ISBN = "3-540-61442-7 (paperback)",
  ISSN = "0302-9743",
  LCCN = "QA76.9.D3 B75 1996",
  series = "Lecture Notes in Computer Science",
  annote = "Schema integration meta-knowledge classification and reuse / R. M. Duwairi, N. J. Fiddian, W. A. Gray -- View mechanism for schema evolution in object-oriented DBMS / Zohra Bellahsene -- An active rule language for ROCK and ROLL / Andrew Dinn \ldots{} [et al.]
-- Integrity constraints in multiversion databases / Anne Doucet \ldots{} [et al.] -- The development of a semantic integrity constraint subsystem for a distributed database / H. Ibrahim, W. A. Gray, N. J. Fiddian -- Understanding the tension between transition rules and confidentiality / Xavier C. Delannoy -- Extending ER for dynamic behaviour and refinement / Simon Wiseman, Bryony Pomeroy -- Speeding up knowledge discovery in large relational databases by means of a new discretization algorithm / Alex Alves Freitas, Simon H. Lavington -- Integration of load measurement parameters into the cost evaluation of database queries / Guntram Flach, Holger Meyer -- High performance OO traversals in Monet / Peter A. Boncz, Fred Kwakkel, Martin L. Kersten -- A modular compiler architecture for a data manipulation language / Suzanne M. Embury, Peter M. D. Gray -- Querying graph databases using a functional language extended with second order facilities / Robert Ayres, Peter J. H. King -- SQL+i: adding temporal indeterminacy to the database language SQL / Antony Griffiths, Babis Theodoulidis -- Pearls, swines and sows' ears: interface research inside a multinational bank / Matthew Chalmers -- Dissemination-based information systems: your data may be where you least expect it / Stanley B. Zdonik -- Microsoft database technologies: an inside view / Nigel Stanley -- Predicate maintained queries: an active OODBMS for financial applications / Mark Butterfield, Nicholas Caine, Stephen Ross-Talbot -- Universal data management / A. Bailey.",
  keywords = "Database management -- Congresses.",
}

@Article{mott:case-based-reasoning:93,
  author = "S.
Mott",
  address = "Cognit Syst Inc, 234 Church St, New Haven, Ct, 06903",
  title = "Case-based reasoning - market, applications, and fit with other technologies",
  journal = "Expert Systems With Applications",
  year = "1993",
  volume = "6",
  number = "1",
  pages = "97--104",
  abstract = "Case-based reasoning (CBR), the hit of the American Association of Artificial Intelligence annual conference in 1991 and 1992 is now enjoying a surge of interest in its first year of commercial availability. Knowledge-based system designers, developers, integrators, and tool vendors are now seriously considering the role and utility of CBR in leveraging the vast experience within organizations for more effective decision making. The potential market for CBR appears enormous, particularly in more complex problem-solving domains, but the areas of most immediate interest are in applications where efficient information processing needs are urgent, such as automated help desks. Early experiments pairing CBR with rule-based systems will soon lead to hybrid combinations with other ``close approximation'' technologies, such as neural networks, fuzzy logic systems, genetic algorithms, and so forth. CBR appears headed for a sustaining role not only as a useful complement in knowledge-based information processing technology but also as an engine for ``mainstream'' information tasks of the future (e.g., intelligent text processing and retrieval, data mining, and projective reasoning). This article will discuss this emerging role for CBR and its implications from a marketing perspective.",
}

@TechReport{mueller:fast-sequential:95,
  author = "Andreas Mueller",
  title = "Fast Sequential and Parallel Algorithms for Association Rule Mining: {A} Comparison",
  institution = "Dept. of Computer Science, Univ.
of Maryland",
  number = "CS-TR-3515",
  address = "College Park, MD",
  month = aug,
  year = "1995",
  URL = "ftp://ftp.cs.umd.edu/pub/papers/papers/3515/3515.ps.Z",
  abstract = "The field of knowledge discovery in databases, or \emph{Data Mining}, has received increasing attention during recent years as large organizations have begun to realize the potential value of the information that is stored implicitly in their databases. One specific data mining task is the mining of Association Rules, particularly from retail data. The task is to determine patterns (or rules) that characterize the shopping behavior of customers from a large database of previous consumer transactions. The rules can then be used to focus marketing efforts such as product placement and sales promotions.\par Because early algorithms required an unpredictably large number of IO operations, reducing IO cost has been the primary target of the algorithms presented in the literature. One of the most recent proposed algorithms, called PARTITION, uses a new TID-list data representation and a new partitioning technique. The partitioning technique reduces IO cost to a constant amount by processing one database portion at a time in memory. We implemented an algorithm called SPTID that incorporates both TID-lists and partitioning to study their benefits. For comparison, a non-partitioning algorithm called SEAR, which is based on a new prefix-tree data structure, is used. Our experiments with SPTID and SEAR indicate that TID-lists have inherent inefficiencies; furthermore, because all of the algorithms tested tend to be CPU-bound, trading CPU-overhead against I/O operations by partitioning did not lead to better performance.\par In order to scale mining algorithms to the huge databases (e.g., multiple Terabytes) that large organizations will manage in the near future, we implemented parallel versions of SEAR and SPEAR (its partitioned counterpart).
The performance results show that, while both algorithms parallelize easily and obtain good speedup and scale-up results, the parallel SEAR version performs better than parallel SPEAR, despite the fact that it uses more communication.",
}

@Book{muller.ea:neural-networks:91,
  author = "Berndt M{\"u}ller and Joachim Reinhardt",
  title = "Neural Networks, an introduction",
  publisher = "Springer-Verlag",
  address = "Berlin",
  series = "Physics of Neural Networks",
  year = "1991",
}

@Article{murphy.ea:exploring-decision:,
  URL = "gopher://P.GP.CS.CMU.EDU:70/00/volume1/murphy94a.ps",
  title = "Exploring the Decision Forest: An Empirical Investigation of Occam's Razor in Decision Tree Induction",
  author = "Patrick M. Murphy and Michael J. Pazzani",
  journal = "Journal of Artificial Intelligence Research",
  volume = "1",
  pages = "257--275",
  year = "1994",
  abstract = "We report on a series of experiments in which all decision trees consistent with the training data are constructed. These experiments were run to gain an understanding of the properties of the set of consistent decision trees and the factors that affect the accuracy of individual trees. In particular, we investigated the relationship between the size of a decision tree consistent with some training data and the accuracy of the tree on test data. The experiments were performed on a massively parallel Maspar computer. The results of the experiments on several artificial and two real world problems indicate that, for many of the problems investigated, smaller consistent decision trees are on average less accurate than the average accuracy of slightly larger trees.",
}

@InProceedings{murthy.ea:oc1-randomized:93,
  author = "S. K. Murthy and S. Kasif and S. Salzberg and R. Beigel",
  title = "{OC1}: Randomized Induction of Oblique Decision Trees",
  booktitle = "Proceedings of the Eleventh National Conference on Artificial Intelligence",
  pages = "322--327",
  address = "Washington, D.C.",
  year = "1993",
}

@PhdThesis{murthy:on-growing:,
  school = "Johns Hopkins University",
  title = "On Growing Better Decision Trees from Data",
  author = "Sreerama K.
Murthy",
  abstract = "This thesis investigates the problem of growing decision trees from data, for the purposes of classification and prediction. After a comprehensive, multi-disciplinary survey of work on decision trees, some algorithmic extensions to existing tree growing methods are considered. The implications of using (1) less greedy search and (2) less restricted splits at tree nodes are systematically studied. Extending the traditional axis-parallel splits to \emph{oblique} splits is shown to be practical and beneficial for a variety of problems. However, the use of more extensive search heuristics than the traditional greedy heuristic is argued to be unnecessary, and often harmful. Any effort to build good decision trees from real-world data involves ``massaging'' the data into a suitable form. Two forms of data massaging, domain-independent and domain-specific, are distinguished in this work. A new framework is outlined for the former, and the importance of the latter is illustrated in the context of two new, complex classification problems in astronomy. Highly accurate and small decision tree classifiers are built for both these problems through a collaborative effort with astronomers.",
  annote = "For individual chapters, see ftp://blaze.cs.jhu.edu/pub/murthy",
  URL = "http://www.cs.jhu.edu/grad/murthy",
}

@PhdThesis{musick:belief-network:,
  URL = "http://http.cs.berkeley.edu/~musick",
  title = "Belief Network Induction",
  author = "Ron Musick",
  school = "University of California, Berkeley",
  abstract = "This dissertation describes BNI (Belief Network Inductor), a tool that automatically induces a belief network from a database. The fundamental thrust of this research program has been to provide a theoretically sound method of inducing a model from data, and performing inference over that model. Along with a solid grounding in probability theory, BNI has proven to be a quick, practical method of inducing data models that are highly accurate.
The results include a belief network that stores beta distributions in the conditional probability tables, coupled with theorems demonstrating how to maintain these distributions through inference; techniques for applying neural network and other learning techniques to the task of conditional probability table learning; and a decision theoretic sampling theory which addresses scalability issues by characterizing the size of the sample needed to produce high quality inferences. The setting for this work is in database mining.",
}

@TechReport{musick:belief-network:95,
  author = "Ron Musick",
  title = "Belief Network Induction",
  institution = "EECS Computer Science Division, University of California, Berkeley",
  type = "Technical Report",
  number = "UCB/CSD-95-863",
  pages = "104",
  month = dec,
  year = "1995",
  URL = "ftp://tr-ftp.cs.berkeley.edu/pub/tech-reports/csd/csd-95-863/all.ps",
  abstract = "This dissertation describes BNI (Belief Network Inductor), a tool that automatically induces a belief network from a database. The fundamental thrust of this research program has been to provide a theoretically sound method of inducing a model from data, and performing inference over that model. Along with a solid grounding in probability theory, BNI has proven to be a quick, practical method of inducing data models that are highly accurate. The results include a belief network that stores beta distributions in the conditional probability tables, coupled with theorems demonstrating how to maintain these distributions through inference; techniques for applying neural network and other learning techniques to the task of conditional probability table learning; and a decision theoretic sampling theory which addresses scalability issues by characterizing the size of the sample needed to produce high quality inferences. The setting for this work is in database mining.
Database mining is one of the fastest growing topics in Artificial Intelligence today, with industry providing at least as much impetus as research labs and universities. The general goal is to extract interesting quantities or relationships that are ``hidden'' in large corporate or scientific databases, with the potential benefits of a successful technology being enormous. For example, models can be built that characterize what types of customers will respond to what types of marketing schemes, retailers will be able to predict sales to help determine correct inventory levels and distribution schedules, and insurance companies will be able to predict expected claim costs and better classify who will buy what type of coverage.",
}

@InProceedings{musick:rethinking-learning:96,
  title = "Rethinking the Learning of Belief Network Probabilities",
  pages = "120",
  author = "Ron Musick",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{nakhaeizadeh.ea:development-multi-criteria:97,
  title = "Development of Multi-Criteria Metrics for Evaluation of Data Mining Algorithms",
  author = "Gholamreza Nakhaeizadeh and Alexander Schnabl",
  pages = "37",
  crossref = "heckerman.ea:proceedings-third:97",
}

@Book{naqvi.ea:logical-language:89,
  author = "Shamim Naqvi and Shalom Tsur",
  title = "A logical language for data and knowledge bases",
  publisher = "Computer Science Press",
  year = "1989",
  series = "Principles of computer science",
  address = "Rockville",
}

@Article{narayanan:revisable:96,
  author = "A. Narayanan",
  address = "Univ Exeter, Dept Comp Sci, Exeter Ex4 4Pt, Devon, England",
  title = "Revisable knowledge discovery in databases",
  journal = "International Journal of Intelligent Systems",
  year = "1996",
  volume = "11",
  number = "2",
  pages = "75--96",
  abstract = "This article introduces the idea of using nonmonotonic inheritance networks for the storage and maintenance of knowledge discovered in data (revisable knowledge discovery in databases).
While existing data mining strategies for knowledge discovery in databases typically involve initial structuring through the use of identification trees and the subsequent extraction of rules from these trees for use in rule-based expert systems, such strategies have difficulty in coping with additional information which may conflict with that already used for the automatic generation of rules. In the worst case, the entire automatic sequence may have to be repeated. If nonmonotonic inheritance networks are used instead of rules for storing knowledge discovered in databases, additional conflicting information can be inserted directly into such structures, thereby bypassing the need for recompilation. (C) 1996 John Wiley \& Sons, Inc.",
  keywords = "LOGIC, CIRCUMSCRIPTION, INHERITANCE",
}

@InProceedings{nassersharif.ea:high-performance-computing:95,
  author = "Bahram Nassersharif and Richard Marciano and Sui-ky Ling and Eugene Ho and Curt Edmonds",
  title = "High-Performance Computing Approaches for Using the {WWW} to Access a Large-Scale Environmental Dataset Repository",
  booktitle = "Proceedings of Supercomputing'95",
  publisher = "ACM/IEEE",
  address = "San Diego, CA",
  month = dec,
  year = "1995",
  keywords = "data mining, mass storage, supercomputing, World Wide Web, Oracle, data repositories, access tools, remote sensing, environmental data, global warming, NALC, ecology",
  abstract = "Simple html document on CD with MPEG.",
}

@InProceedings{ng.ea:efficient-and-effective-clustering-methods-for-spatial-data-mining:94,
  author = "Raymond T. Ng and Jiawei Han",
  title = "Efficient and Effective Clustering Methods for Spatial Data Mining",
  booktitle = "Proceedings of the Twentieth International Conference on Very Large Databases",
  year = "1994",
  address = "Santiago, Chile",
  pages = "144--155",
  abstract = "Spatial data mining is the discovery of interesting relationships and characteristics that may exist implicitly in spatial databases.
In this paper, we explore whether clustering methods have a role to play in spatial data mining. To this end, we develop a new clustering method called CLARANS which is based on randomized search. We also develop two spatial data mining algorithms that use CLARANS. Our analysis and experiments show that with the assistance of CLARANS, these two algorithms are very effective and can lead to discoveries that are difficult to find with current spatial data mining algorithms. Furthermore, experiments conducted to compare the performance of CLARANS with that of existing clustering methods show that CLARANS is the most efficient.",
}

@InProceedings{ng.ea:efficient-effective:94,
  author = "R. T. Ng and J. Han",
  title = "Efficient and Effective Clustering Methods for Spatial Data Mining",
  editor = "Jorge B. Bocca and Matthias Jarke and Carlo Zaniolo",
  booktitle = "20th International Conference on Very Large Data Bases, September 12--15, 1994, Santiago, Chile proceedings",
  publisher = "Morgan Kaufmann Publishers",
  address = "Los Altos, CA 94022, USA",
  pages = "144--155",
  year = "1994",
  annote = "Also known as VLDB'94",
  keywords = "very large data bases; VLDB",
}

@Book{nilsson:principles-artificial-intelligence:82,
  author = "Nils J. Nilsson",
  title = "Principles of {Artificial Intelligence}",
  publisher = "Springer-Verlag",
  year = "1982",
  series = "Symbolic Computation",
}

@Article{oleary:some-privacy:95,
  author = "D. E. O'Leary",
  address = "Univ So Calif, 3660 Trousdale Pkwy, Los Angeles, Ca, 90089",
  title = "Some privacy issues in knowledge discovery - the {OECD} personal privacy guidelines",
  journal = "{IEEE} Expert: Intelligent Systems \& Their Applications",
  year = "1995",
  volume = "10",
  number = "2",
  pages = "48--52",
  keywords = "ethics, privacy, ethical issues",
}

@Article{owrang.ea:using-domain:96,
  author = "M. M. Owrang and F. H.
Grupe",
  address = "American Univ, Dept Comp Sci, 400 Massachusetts Ave Nw, Washington, Dc, 20016 Univ Nevada, Dept Accounting \& Comp Informat Syst, Reno, Nv, 89557",
  title = "Using domain knowledge to guide database knowledge discovery",
  journal = "Expert Systems With Applications",
  year = "1996",
  volume = "10",
  number = "2",
  pages = "173--180",
  abstract = "Modern database technologies process large volumes of data to discover new knowledge. Some large databases make discovery computationally expensive. Additional knowledge, known as domain or background knowledge, hidden in the database can often guide and restrict the search for interesting knowledge. This paper discusses mechanisms by which domain knowledge can be used effectively in discovering knowledge from databases. In particular we look at the use of domain knowledge to reduce the search as well as to optimize the hypotheses which represent the interesting knowledge to be discovered. Also, we discuss how to use domain knowledge to test the validity of the discovered knowledge. Although domain knowledge can be used to improve database searches, it should not block the discovery of unexpected knowledge. We provide some guidelines to use domain knowledge properly.",
}

@InProceedings{padmanabhan.ea:pattern-temporal:96,
  title = "Pattern Discovery in Temporal Databases: {A} Temporal Logic Approach",
  pages = "351",
  author = "Balaji Padmanabhan and Alexander Tuzhilin",
  crossref = "simoudis.ea:proceedings-second:96",
}

@Article{park.ea:effective-hash-based:95,
  author = "Jong Soo Park and Ming-Syan Chen and P. S. Yu",
  title = "An effective hash-based algorithm for mining association rules",
  journal = "SIGMOD Record (ACM Special Interest Group on Management of Data)",
  volume = "24",
  number = "2",
  pages = "175--186",
  month = jun,
  year = "1995",
  ISSN = "0163-5808",
  affiliation = "IBM Thomas J. Watson Res.
Center, Yorktown Heights, NY, USA",
  classification = "C7170 (Marketing computing); C7180 (Retailing and distribution computing); C6170K (Knowledge engineering techniques); C6160 (Database management systems (DBMS))",
  keywords = "Effective hash-based algorithm; Association rules mining; Sales transactions; Candidate set generation; Performance bottleneck",
  thesaurus = "Database management systems; Knowledge acquisition; Pattern matching; Retail data processing; Sales management",
  xxcrossref = "Anonymous:1995:ASI",
}

@InProceedings{park.ea:effective-hash:95,
  title = "An Effective Hash Based Algorithm for Mining Association Rules",
  author = "Jong Soo Park and Ming-Syan Chen and Philip S. Yu",
  editor = "Michael J. Carey and Donovan A. Schneider",
  booktitle = "Proceedings of the 1995 {ACM} {SIGMOD} International Conference on Management of Data",
  address = "San Jose, California",
  month = "22--25~" # may,
  year = "1995",
  pages = "175--186",
}

@Book{parsaye.ea:intelligent-database:93,
  author = "K. Parsaye and M. Chignell",
  title = "Intelligent Database Tools \& Applications",
  publisher = "John Wiley",
  year = "1993",
}

@Article{parsaye:olap-bridging:97,
  author = "Kamran Parsaye",
  title = "{OLAP} and Data Mining: Bridging the Gap",
  journal = "Database Programming and Design",
  year = "1997",
  month = feb,
  URL = "http://www.dbpd.com/parsfeb.htm",
  annote = "OLAP and data mining--while very different--are both integral to the decision-support process. By carefully linking them, you can make sure one activity reinforces the other.",
}

@Article{pavilion:epidemiological:96,
  author = "G. Pavilion",
  title = "Knowledge Discovery from Epidemiological Databases",
  journal = "Lecture Notes in Computer Science",
  volume = "1057",
  pages = "201--??",
  year = "1996",
  ISSN = "0302-9743",
}

@InProceedings{pazzani.ea:beyond-concise:97,
  title = "Beyond Concise and Colorful: Learning Intelligible Rules",
  author = "Michael J. Pazzani and Subramani Mani and W.
Rodman Shankle",
  pages = "235",
  crossref = "heckerman.ea:proceedings-third:97",
}

@Article{pedrycz:conditional-fuzzy:96,
  author = "W. Pedrycz",
  address = "Univ Manitoba, Dept Elect \& Comp Engn, Winnipeg, Mb R3T 5V6, Canada",
  title = "Conditional fuzzy c-means",
  journal = "Pattern Recognition Letters",
  year = "1996",
  volume = "17",
  number = "6",
  pages = "625--631",
  abstract = "A Fuzzy C-Means-based clustering method guided by an auxiliary (conditional) variable is introduced. The method reveals a structure within a family of patterns by considering their vicinity in a feature space along with the similarity of the values assumed by a certain conditional variable. The usefulness of the algorithm is exemplified in the problems of data mining.",
  keywords = "FUZZY CLUSTERING, FUZZY C-MEANS, CONDITIONAL VARIABLE, DATA MINING, RADIAL BASIS FUNCTIONS",
}

@Article{perna:leveraging-information:95,
  author = "J. Perna",
  title = "Leveraging the information asset",
  journal = "SIGMOD Record (ACM Special Interest Group on Management of Data)",
  volume = "24",
  number = "2",
  pages = "451--452",
  month = jun,
  year = "1995",
  ISSN = "0163-5808",
  affiliation = "IBM Canada Ltd., Toronto, Ont., Canada",
  classification = "C7100 (Business and administration); C6160 (Database management systems (DBMS)); C7250L (Non-bibliographic retrieval systems); C7250R (Information retrieval techniques)",
  keywords = "Information asset; Corporate asset; Database users; Competitive advantage; Unwanted store inventory; Capital outlay; Store sales data; Real time access; Production point of sale information; Database mining; Multivendor database connectivity; Heterogeneous clients; Customer needs; Marketplace",
  thesaurus = "Business data processing; Database management systems; Information retrieval; Real-time systems",
  xxcrossref = "Anonymous:1995:ASI",
}

@InProceedings{pfitzner.ea:parallel-halo:96,
  title = "Parallel Halo Finding in {N}-body Cosmology Simulations",
  pages = "26",
  author = "David W.
Pfitzner and John K. Salmon",
  crossref = "simoudis.ea:proceedings-second:96",
}

@Unpublished{piatetsky-shapiro.ea:kdd-93-progress:,
  title = "{KDD}-93: Progress and Challenges in Knowledge Discovery in Databases",
  author = "G. Piatetsky-Shapiro and C. Matheus and P. Smyth and R. Uthurusamy",
  note = "To be published in AI Magazine",
  URL = "http://info.gte.com/~kdd/kdd-93-report.tex",
  keywords = "Knowledge, Discovery, Databases",
  annote = "(a long report on AAAI-93 KDD Workshop), to be published in AI Magazine.",
}

@Article{piatetsky-shapiro.ea:kdd-93-progress:94,
  author = "G. Piatetsky-Shapiro and C. Matheus and P. Smyth and R. Uthurusamy",
  address = "Gte Labs Inc, Knowledge Discovery Databases Project, Waltham, Ma, 02254 Jet Prop Lab, Tech Grp, Pasadena, Ca, 91109 Gm Corp, Res Labs, Detroit, Mi, 48202",
  title = "{KDD}-93 - progress and challenges in knowledge discovery in databases",
  journal = "AI Magazine",
  year = "1994",
  volume = "15",
  number = "3",
  pages = "77--82",
  abstract = "Over 60 researchers from 10 countries took part in the Third Knowledge Discovery in Databases (KDD) Workshop, held during the Eleventh National Conference on Artificial Intelligence in Washington, D.C. A major trend evident at the workshop was the transition to applications in the core KDD area of discovery of relatively simple patterns in relational databases; the most successful applications are appearing in the areas of greatest need, where the databases are so large that manual analysis is impossible. Progress has been facilitated by the availability of commercial KDD tools for both generic discovery and domain-specific applications such as marketing. At the same time, progress has been slowed by problems such as lack of statistical rigor, overabundance of patterns, and poor integration.
Besides applications, the main themes of this workshop were (1) the discovery of dependencies and models and (2) integrated and interactive KDD systems.",
}

@Book{piatetsky-shapiro.ea:knowledge-discovery:91,
  editor = "Gregory Piatetsky-Shapiro and William Frawley",
  title = "Knowledge Discovery in Databases",
  publisher = "The MIT Press",
  address = "Cambridge, MA",
  pages = "xii + 525",
  year = "1991",
  ISBN = "0-262-66070-9 (paper)",
  LCCN = "Q325.5 .K68 1991",
}

@InProceedings{piatetsky-shapiro.ea:overview-issues:96,
  title = "An Overview of Issues in Developing Industrial Data Mining and Knowledge Discovery Applications",
  pages = "89",
  author = "Gregory Piatetsky-Shapiro and Ron Brachman and Tom Khabaza and Willi Kloesgen and Evangelos Simoudis",
  crossref = "simoudis.ea:proceedings-second:96",
}

@Article{piatetsky-shapiro.ea:workbench-exploring:92,
  author = "G. Piatetsky-Shapiro and C. J. Matheus",
  address = "Gte Labs Inc, Waltham, Ma, 02254",
  title = "Knowledge discovery workbench for exploring business databases",
  journal = "International J. of Intelligent Systems",
  year = "1992",
  volume = "7",
  number = "7",
  pages = "675--686",
  abstract = "We describe the Knowledge Discovery Workbench, an interactive system for database exploration. We then illustrate KDW capabilities in data clustering, summarization, classification, and discovery of changes. We also examine extracting dependencies from data and using them to order the multitude of data patterns.",
}

@InCollection{piatetsky-shapiro:analysis-presentation:91,
  editor = "Gregory Piatetsky-Shapiro and William J.
Frawley",
  booktitle = "Knowledge Discovery in Databases",
  publisher = "AAAI Press / The MIT Press",
  address = "Menlo Park, California",
  edition = "First",
  year = "1991",
  author = "Gregory Piatetsky-Shapiro",
  title = "Discovery, Analysis and Presentation of Strong Rules",
  pages = "229--248",
}

@InProceedings{piatetsky-shapiro:analysis-strong:89,
  author = "G. Piatetsky-Shapiro",
  title = "Discovery and Analysis of Strong Rules in Databases",
  booktitle = "Advanced Database System Symposium, Kyoto",
  year = "1989",
  month = dec,
}

@Article{piatetsky-shapiro:business:96,
  author = "G. Piatetsky-Shapiro",
  title = "Data mining and knowledge discovery in business databases",
  journal = "Lecture Notes in Computer Science",
  volume = "1079",
  pages = "56--??",
  year = "1996",
  ISSN = "0302-9743",
}

@Article{piatetsky-shapiro:introduction:92,
  crossref = "ijis-special-issue:92",
  year = "1992",
  author = "Gregory Piatetsky-Shapiro",
  title = "Introduction",
  pages = "587--589",
  annote = "Definition of KDD. Scientific Discovery, Commercial Discovery, overview of papers.",
}

@TechReport{piatetsky-shapiro:kdd-93-proceedings:93,
  key = "piatetsky-shapiro:kdd-93-proceedings:93",
  editor = "G. Piatetsky-Shapiro",
  title = "{KDD}-93: Proceedings of {AAAI}-93 Knowledge Discovery in Databases workshop",
  institution = "AAAI",
  number = "WS-02",
  month = jul,
  year = "1993",
  note = "AAAI Press technical report",
}

@Misc{piatetsky-shapiro:kdd-frequently:94,
  key = "piatetsky-shapiro:kdd-frequently:94",
  title = "{KDD} Frequently Asked Questions",
  editor = "Gregory Piatetsky-Shapiro",
  howpublished = "Published via WWW http://info.gte.com/~kdd/FAQ.txt",
  month = "18th " # apr,
  year = "1994",
}

@Misc{piatetsky-shapiro:kdd-nugget:94-7,
  title = "{KDD} Nugget 94-7",
  key = "piatetsky-shapiro:kdd-nugget:94-7",
  editor = "Gregory Piatetsky-Shapiro",
  howpublished = "Mailing list",
  month = "18th " # apr,
  year = "1994",
  annote = "Contents: * G.
Piatetsky-Shapiro, Time: Attack of the Data Miners Business Week: Gold Mine of Data in Customer Service ComputerWorld: Data is money, but people are special US Census Bureau is now on WWW at http://www.census.gov/ * Tej Anand, AT\&T Data Mining Conference * Larry Ai, TRW Smart Charts for Pharmaceuticals * Edwin Pednault, MDL workshop at ML/COLT 94 * Roberto Zicari, CFP: Theory and Practice of Object Systems",
}

@Article{piatetsky-shapiro:knowledge-discovery:91b,
  key_modifier = "b",
  author = "G. Piatetsky-Shapiro",
  address = "Gte Labs Inc, Waltham, Ma, 02254",
  title = "Knowledge discovery in databases",
  journal = "IEEE Expert-Intelligent Systems \& Their Applications",
  year = "1991",
  month = oct,
  volume = "6",
  number = "5",
  pages = "74--76",
  note = "Discussion of second AAAI workshop on KDD",
}

@Article{piatetsky-shapiro:personal-vs:95,
  author = "G. Piatetsky-Shapiro",
  address = "Gte Labs Inc, Waltham, Ma, 02254",
  title = "Knowledge discovery in personal data vs privacy - a minisymposium",
  journal = "IEEE Expert-Intelligent Systems \& Their Applications",
  year = "1995",
  volume = "10",
  number = "2",
  pages = "46--47",
}

@Article{piatetsky-shapiro:progress-report:94,
  author = "G. Piatetsky-Shapiro",
  address = "Gte Labs Inc, 40 Sylvan Rd, Waltham, Ma, 02254",
  title = "Knowledge discovery in databases - progress report",
  journal = "Knowledge Engineering Review",
  year = "1994",
  volume = "9",
  number = "1",
  pages = "57--60",
}

@Article{piatetsky-shapiro:real-report:91a,
  key_modifier = "a",
  author = "Gregory Piatetsky-Shapiro",
  title = "Knowledge Discovery in Real Databases: {A} Report on the {IJCAI}-89 Workshop",
  journal = "AI Magazine",
  pages = "68--70",
  volume = "11",
  number = "5",
  month = jan,
  year = "1991",
}

@Article{piatetsky-shapiro:special-issue:92,
  author = "G. Piatetsky-Shapiro",
  address = "Gte Labs Inc, 40 Sylvan Rd, Waltham, Ma, 02254",
  title = "Special issue - knowledge discovery in data-bases and knowledge bases - introduction",
  journal = "International J.
of Intelligent Systems",
  year = "1992",
  volume = "7",
  number = "7",
  pages = "587--589",
}

@InProceedings{pompe.ea:application-ilp:96,
  author = "U. Pompe and I. Kononenko and T. Mak{\v{s}}e",
  title = "An application of {ILP} in a musical database: {L}earning to compose the two-voice counterpoint",
  booktitle = "Proceedings of the MLnet Familiarization Workshop on Data Mining with Inductive Logic Programming",
  pages = "1--11",
  year = "1996",
}

@Article{price:starlight-star:95,
  author = "D. Price",
  address = "Univ Nevada, Reno, Nv, 89557 American Univ, Washington, Dc, 20016",
  title = "Starlight, star bright - data-mining the cosmos",
  journal = "IEEE Expert-Intelligent Systems \& Their Applications",
  year = "1995",
  volume = "10",
  number = "4",
  pages = "10--13",
}

@Article{project:comparative-benchmarking:,
  author = "{ELENA project}",
  title = "Comparative benchmarking studies of various algorithms",
  annote = "Elena project is at: http://www.dice.ucl.ac.be/neural-nets/ELENA/ELENA.html",
  URL = "ftp://ftp.dice.ucl.ac.be/pub/neural-nets/ELENA/databases/Benchmarks.ps.Z",
}

@InProceedings{provan.ea:model-simplicity:96,
  title = "Data Mining and Model Simplicity: {A} Case Study in Diagnosis",
  pages = "57",
  author = "Gregory M. Provan and Moninder Singh",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{provost.ea:analysis-classifier:97,
  title = "Analysis and Visualization of Classifier Performance: Comparison under Imprecise Class and Cost Distributions",
  author = "Foster Provost and Tom Fawcett",
  pages = "43",
  crossref = "heckerman.ea:proceedings-third:97",
  abstract = "When mining data with inductive methods, we often experiment with a wide variety of learning algorithms, using different algorithm parameters, varying output threshold values, and using different training regimens. Such experimentation yields a large number of classifiers to be evaluated and compared.
In order to compare the performance of classifiers it is necessary to know the conditions under which they will be used; using accuracy alone is inadequate because class distributions and misclassification costs are rarely uniform. Decision-theoretic principles may be used if the class and cost distributions are known exactly. Unfortunately, on real-world problems target cost and class distributions can rarely be specified precisely, and they are often subject to change. For example, in fraud detection we cannot ignore either type of distribution, nor can we assume that our distribution specifications are static or precise. We need a method for the management and comparison of multiple classifiers that is robust to imprecise and changing environments. We introduce the ROC convex hull method, which combines techniques from ROC analysis, decision analysis and computational geometry. The method decouples classifier performance from specific class and cost distributions, and may be used to specify the subset of methods that are potentially optimal under any cost and class distribution assumptions. The ROC convex hull method is efficient, so it facilitates the comparison of a large number of classifiers. It minimizes the management of classifier performance data, because it can specify exactly those classifiers that are potentially optimal, and it is incremental, easily incorporating new and varied classifiers.",
  URL      = "http://www.cs.umass.edu/~fawcett/papers/KDD-97.ps.gz",
}

@InProceedings{provost.ea:inductive-policy:92,
  author    = "F. J. Provost and B. G. Buchanan",
  title     = "Inductive Policy",
  booktitle = "Proc.\ of AAAI-92",
  pages     = "255--262",
  year      = "1992",
}

@InProceedings{provost.ea:scaling-up:97,
  author   = "Foster Provost and Venkateswarlu Kolluri",
  title    = "Scaling Up Inductive Algorithms: An Overview",
  pages    = "239",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InCollection{punch.ea:royal-tree:96,
  author    = "William F. Punch and Douglas Zongker and Erik D.
Goodman",
  title = "The Royal Tree Problem, a Benchmark for Single and Multiple Population Genetic Programming",
  booktitle = "Advances in Genetic Programming 2",
  publisher = "MIT Press",
  year = "1996",
  editor = "Peter J. Angeline and Kinnear, Jr., K. E.",
  pages = "299--316",
  chapter = "15",
  address = "Cambridge, MA, USA",
  keywords = "genetic algorithms, genetic programming",
  ISBN = "0-262-01158-1",
  abstract = "We have previously shown how a genetic algorithm (GA) can be used to perform _data mining_, the discovery of particular/important data within large datasets, by finding optimal data classifications using known examples. However, these approaches, while successful, limited data relationships to those that were _fixed_ before the GA run. We report here on an extension of our previous work, substituting a genetic program (GP) for a GA. The GP could optimize data classification, as did the GA, but could also determine the functional relationships among the features. This gave improved performance and new information on important relationships among features. We discuss the overall approach, and compare the effectiveness of the GA vs. GP on a biochemistry problem, the determination of the involvement of bound water molecules in protein interactions.",
  note = "Also available as GARAGe96-01-01",
  size = "18 pages",
}

@Article{quinlan.ea:foil-midterm:,
  author = "J. Ross Quinlan and R. M. Cameron-Jones",
  title = "{FOIL}: {A} Midterm Report",
  abstract = "FOIL is a learning system that constructs Horn clause programs from examples. This paper summarises the development of FOIL from 1989 up to early 1993 and evaluates its effectiveness on a non-trivial sequence of learning tasks taken from a Prolog programming text. Although many of these are handled reasonably well, the experiment highlights some weaknesses of the current implementation. Areas for further research are identified.",
}

@Book{quinlan:c4-5:92,
  author = "J.
Ross Quinlan",
  title = "{C4}.5: Programs for Machine Learning",
  publisher = "Morgan Kaufmann",
  year = "1992",
}

@Unpublished{quinlan:comparing-connectionist:,
  author = "J. Ross Quinlan",
  title = "Comparing connectionist and symbolic learning methods",
  institution = "University of Sydney",
}

@InProceedings{quinlan:determining-literals:91,
  author = "J. Ross Quinlan",
  title = "Determining literals in inductive logic programming",
  booktitle = "Proceedings of the 12th International Joint Conference on Artificial Intelligence",
  pages = "746--750",
  address = "Sydney, Australia",
  year = "1991",
}

@InCollection{quinlan:effect-noise:86,
  author = "J. Ross Quinlan",
  title = "The effect of noise on concept learning",
  crossref = "michalski.ea:machine-learning:86",
  pages = "149--166",
}

@InProceedings{quinlan:empirical-comparision:88,
  author = "J. Ross Quinlan",
  title = "An empirical comparison of genetic and decision-tree classifiers",
  booktitle = "Proceedings of the 5th International Conference on Machine Learning",
  pages = "135--141",
  address = "Ann Arbor",
  year = "1988",
}

@Article{quinlan:induction-decision:86,
  author = "J. Ross Quinlan",
  title = "Induction of Decision Trees",
  journal = "Machine Learning",
  year = "1986",
  volume = "1",
  pages = "81--106",
}

@InCollection{quinlan:learning-efficient:83,
  author = "J. Ross Quinlan",
  title = "Learning efficient classification procedures and their application to chess end games",
  crossref = "michalski.ea:machine-learning:83",
  pages = "463--482",
}

@Article{radcliffe.ea:cooperation-through:,
  URL = "ftp://ftp.epcc.ed.ac.uk/pub/tr/94/tr9409.ps.Z",
  title = "Cooperation through Hierarchical Competition in Genetic Data Mining",
  author = "N. J. Radcliffe and P. D. Surry",
  note = "Parallel Computing Centre,Edinburgh",
}

@InProceedings{ram:information-sharing:94,
  author = "Sudha Ram",
  title = "Information Sharing and Knowledge Discovery in Large Scientific Databases: Introduction",
  pages = "397--397",
  editor = "Jay F. Nunamaker and Ralph H.
Sprague",
  booktitle = "Proceedings of the 27th Annual Hawaii International Conference on System Sciences. Volume 3: Information Systems: {DSS}/Knowledge-Based Systems",
  month = jan,
  publisher = "IEEE Computer Society Press",
  address = "Los Alamitos, CA, USA",
  year = "1994",
}

@InProceedings{ram:information-sharing:95,
  author = "S. Ram",
  title = "Information sharing and knowledge discovery in large scientific databases - introduction",
  pages = "252--252",
  editor = "Jay F. Nunamaker and Ralph H. Sprague",
  booktitle = "Proceedings of the 28th Annual Hawaii International Conference on System Sciences. Volume 3: Information Systems - Decision Support and Knowledge-Based Systems",
  month = jan,
  publisher = "IEEE Computer Society Press",
  address = "Los Alamitos, CA, USA",
  year = "1995",
}

@Article{rao.ea:knowledge-based-equation:93,
  author = "R. Bharat Rao and Stephen Y. Lu",
  title = "A Knowledge-Based Equation Discovery System for Engineering Domains",
  journal = "IEEE Expert",
  year = "1993",
  pages = "37--42",
  month = aug,
  annote = "Deals with KEDS system",
}

@InProceedings{rao.ea:visualizing-bagged:97,
  title = "Visualizing Bagged Decision Trees",
  author = "J. Sunil Rao and William J. E. Potts",
  pages = "243",
  crossref = "heckerman.ea:proceedings-third:97",
}

@Article{rau:calculating-salience:93,
  author = "L. F. Rau",
  address = "Ge Co, Ctr Res \& Dev, Artificial Intelligence Lab, Schenectady, Ny, 12301",
  title = "Calculating salience and breadth of knowledge",
  journal = "IEEE Trans.
on Knowledge and Data Engineering",
  year = "1993",
  volume = "5",
  number = "6",
  pages = "996--998",
  abstract = "As computer programs grow to contain more information, it will become more important, when faced with a new system, to be able to ask, ``What do you know about?'' This correspondence paper overviews some recently completed research [1] investigating three questions: 1) what it means for a computer to know what it knows about, 2) how a computer can construct a representation of what it knows about, and 3) how such a representation can be used for practical applications that advance the state-of-the-art in understanding the content of large databases.",
  keywords = "ARTIFICIAL INTELLIGENCE, COGNITIVE MODELING, COMPUTER SCIENCE, DATABASE MANAGEMENT, INFORMATION MANAGEMENT, INFORMATION RETRIEVAL, KNOWLEDGE DISCOVERY",
}

@InProceedings{raymer.ea:genetic-programming:96,
  author = "M. L. Raymer and W. F. Punch and E. D. Goodman and L. A. Kuhn",
  title = "Genetic Programming for Improved Data Mining: An Application to the Biochemistry of Protein Interactions",
  booktitle = "Genetic Programming 1996: Proceedings of the First Annual Conference",
  editor = "John R. Koza and David E. Goldberg and David B. Fogel and Rick L. Riolo",
  year = "1996",
  month = "28--31 " # jul,
  keywords = "Genetic Programming, Genetic Algorithms",
  pages = "375--380",
  address = "Stanford University, CA, USA",
  publisher = "MIT Press",
  URL = "http://isl.cps.msu.edu/GA/papers/GARAGe96-04-01.ps",
  size = "6 pages",
  note = "GP-96 Also available as TR GARAGe96-04-01",
}

@Article{reese-hedberg:parallelism-speeds:95,
  author = "S.
{Reese Hedberg}",
  title          = "Parallelism speeds data mining",
  journal        = "IEEE parallel and distributed technology: systems and applications",
  volume         = "3",
  number         = "4",
  pages          = "3--6",
  month          = "Winter",
  year           = "1995",
  ISSN           = "1063-6552",
  classification = "C6110P (Parallel programming); C6160K (Deductive databases); C6170K (Knowledge engineering techniques); C7120 (Financial computing); C7130 (Public administration)",
  keywords       = "6-processor; artificial intelligence; banks; cash; casinos; data mining; data pattern recognition; data processing; deductive databases; financial data processing; genetic algorithms; government; knowledge acquisition; knowledge discovery; machine learning; money laundering; multidimensional database querying; neural networks; online application processing tools; parallel processing; parallelism; pattern; prediction; query processing; rule-based; siftware; statistical techniques; stored data; Sun server; systems; techniques; transactions; US Department of Treasury",
}

@InProceedings{richeldi.ea:performing-effective:96,
  author   = "Marco Richeldi and Pier Luca Lanzi",
  title    = "Performing Effective Feature Selection by Investigating the Deep Structure of the Data",
  pages    = "379",
  crossref = "simoudis.ea:proceedings-second:96",
}

@Book{ringland.ea:approaches-to:88,
  editor    = "G. A. Ringland and D. A. Duce",
  title     = "Approaches to Knowledge Representation: An Introduction",
  publisher = "Research studies press Ltd.",
  address   = "Letchworth, England",
  year      = "1988",
}

@Misc{risvik:discretization-numerical:97,
  author        = "Knut Magne Risvik",
  title         = "Discretization of Numerical Attributes",
  howpublished  = "Unpublished article. Report from undergraduate student project.",
  month         = apr,
  year          = "1997",
  URL           = "http://www.pvv.ntnu.no/~kmr/report/discretization.ps",
  contributedby = "Knut Magne Risvik, kmr(at)idi.ntnu.no",
}

@Article{rivest:learning-decision:87,
  author  = "Ronald L.
Rivest",
  title = "Learning Decision Lists",
  journal = "Machine Learning",
  year = "1987",
  volume = "2",
  pages = "229--246",
}

@TechReport{roberto-j-bayardo:dealing-with:96,
  author = "Bayardo, Jr., Roberto J.",
  title = "Dealing with Duplicate Tuples in Multi-Join Query Processing",
  institution = "The University of Texas at Austin, Department of Computer Sciences",
  type = "Technical Report",
  number = "UTEXAS.CS//CS-TR-96-11",
  pages = "7",
  month = may,
  year = "1996",
  keywords = "query processing, multi-join queries, query optimization",
  URL = "ftp://ftp.cs.utexas.edu/pub/techreports/tr96-11.ps.Z",
  abstract = "This paper presents and evaluates several schemes for handling duplicate tuple elimination during optimization and execution of large select-project-join queries. The primary issues investigated are (1) precisely when to apply duplicate tuple removal during query evaluation, and (2) how an optimizer should predict the effects of removing duplicates. We also develop a realistic model of multiple join queries inspired by a proposed data-mining application. Through experiments on this model, we find two critical techniques for high performance execution of select-project-join queries: First, the optimizer should decide where duplicates are removed within the query plan independent of the projections creating them. Second, join algorithms should remove duplicates when sorting or hashing their input, and the optimizer should be capable of predicting its effects.",
}

@Article{roddick.ea:handling-discovered:96,
  author = "J. F. Roddick and N. G. Craske and T. J. Richards",
  address = "Univ S Australia, Sch Comp \& Informat Sci, Adv Comp Res Ctr, Levels Campus, the Levels, Sa 5095, Australia Monash Univ, Dept Comp Technol, Caulfield, Vic 3145, Australia Qualitat Solut \& Res Pty Ltd, Bundoora, Vic 3083, Australia",
  title = "Handling discovered structure in database-systems",
  journal = "IEEE Trans.
on Knowledge and Data Engineering",
  year = "1996",
  volume = "8",
  number = "2",
  pages = "227--240",
  abstract = "Most database systems research assumes that the database schema is determined by a database administrator. With the recent increase in interest in knowledge discovery from databases and the predicted increase in the volume of data expected to be stored it is appropriate to reexamine this assumption and investigate how derived or induced, rather than database administrator supplied, structure can be accommodated and used by database systems. This paper investigates some of the characteristics of inductive learning and knowledge discovery as they pertain to database systems and the constraints that would be imposed on appropriate inductive learning algorithms is discussed. A formal method of defining induced dependencies (both static and temporal) is proposed as the inductive analogue to functional dependencies. The Boswell database system exemplifying some of these characteristics is also briefly discussed.",
  keywords = "DESIGN, RULE, INDUCTIVE DATA MODELS, KNOWLEDGE DISCOVERY, TEMPORAL INFERENCE, BOSWELL",
}

@InProceedings{rose.ea:reaction:93,
  author = "John R. Rose and Herbert Gelernter",
  title = "Knowledge Discovery in Reaction Databases",
  pages = "714--716",
  editor = "Bharat Bhargava and Timothy Finin and Yelena Yesha",
  booktitle = "Proceedings of the 2nd International Conference on Information and Knowledge Management",
  month = nov,
  publisher = "ACM Press",
  address = "New York, NY, USA",
  year = "1993",
}

@Article{rosen:how-good:,
  title = "How Good Were Those Probability Predictions? The Expected Recommendation Loss ({ERL}) Scoring Rule",
  author = "David B. Rosen",
  note = "To appear in: Maximum Entropy and Bayesian Methods. (Proceedings of the Thirteenth International Workshop, August 1993.) G. Heidbreder, ed. Kluwer, Dordrecht, The Netherlands, 1996.
8 pages.",
  abstract = "We present a new way to choose an appropriate scoring rule for evaluating the performance of a _soft classifier_, i.e. of a supplier of predicted (inferred/estimated/learned/guessed) probabilities. A scoring rule (probability loss function) is a function of a single such prediction and the corresponding outcome event (true class); its expectation over the data space is the generalization performance of ultimate interest, while its sum or average over some benchmark test data set is an empirical performance measure. A user of probability predictions can apply his own decision threshold, preferring to err on one side, for example, to the extent that the consequences of an erroneous decision are more severe on the other side; this process is the subject of decision theory/analysis. We are not able to specify in advance, with certainty, these relative consequences, i.e. the user's cost matrix (indexed by decision and outcome event) defining his decision-making problem. So we represent this uncertainty itself by a distribution, from which we think of the cost matrix as being drawn. Specifying this distribution determines a uniquely appropriate scoring rule. We can interpret and characterize common scoring rules, such as the logarithmic (cross-entropy), quadratic (squared error or Brier), and the _0-1_ misclassification score, as representing different assumptions about the probability that the predictions will be used in various decision-making problems. We discuss the connection to the theory of proper (truth- or honesty-rewarding) scoring rules.",
  URL      = "http://www.scs.unr.edu/~cbmr/people/rosen/erl",
}

@InProceedings{rubinstein.ea:discriminative-vs:97,
  title    = "Discriminative vs Informative Learning",
  author   = "Y.
Dan Rubinstein and Trevor Hastie",
  pages = "49",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InProceedings{rymon:se-trees-outperform:96,
  title = "{SE}-Trees Outperform Decision Trees in Noisy Domains",
  pages = "331",
  author = "Ron Rymon",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{ryu.ea:deriving-queries:96,
  author = "Tae-Wan Ryu and Christoph F. Eick",
  title = "Deriving Queries From Examples Using Genetic Programming",
  booktitle = "The Second International Conference on Knowledge Discovery and Data Mining (KDD-96)",
  editor = "Evangelos Simoudis and Jia Wei Han and Usama Fayyad",
  year = "1996",
  month = aug # " 2--4",
  keywords = "Genetic Programming, Genetic Algorithms, MASSON",
  pages = "303",
  address = "Portland, Oregon, USA",
  publisher = "AAAI",
  URL = "http://www.cs.uh.edu/~twryu/papers/kdd96.ps",
  size = "14 pages",
  abstract = "This paper centers on the problem of extracting intensional information for a set of objects from an object-oriented database. In our approach, the extracted intensional information for the given set of objects are described by object-oriented queries that compute this set of objects. The paper discusses the architecture of a knowledge discovery system, called MASSON, which employs genetic programming to find such queries, moreover, we will show how interesting queries that describe commonalities within a set of objects are automatically generated, modified, evaluated, and selected; we will also discuss how the search for the _best_ query is conducted by the MASSON system. We also report on an experiment that evaluated the knowledge discovery capability of MASSON.",
  annote = "KDD-96 http://www.aaai.org:80/Press/Proceedings/KDD/1996/kdd-96.html",
  affiliation = "University of Houston",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{ryu.ea:masson--discovering:96,
  author = "Tae-Wan Ryu and Christoph F.
Eick",
  title = "{MASSON}: Discovering Commonalities in Collection of Objects using Genetic Programming",
  booktitle = "Genetic Programming 1996: Proceedings of the First Annual Conference",
  editor = "John R. Koza and David E. Goldberg and David B. Fogel and Rick L. Riolo",
  year = "1996",
  month = "28--31 " # jul,
  keywords = "Genetic Programming, Genetic Algorithms",
  pages = "200--208",
  address = "Stanford University, CA, USA",
  publisher = "MIT Press",
  URL = "http://www.cs.uh.edu/~twryu/papers/gp96.ps",
  size = "9 pages",
  abstract = "For the current flood of data, automatic tools for searching or analyzing data are necessary, especially for complex databases. Accordingly, knowledge discovery in databases is getting more and more attention. This paper centers on the problem of discovering the common characteristics that are shared by a set of objects belonging to an object-oriented database. In our approach, commonalities within a set of objects are described by object-oriented queries that compute this set of objects. The paper discusses the architecture of a knowledge discovery system, called MASSON, which employs genetic programming to find such queries, and presents an example run of the system to illustrate how the system works; we will show how interesting queries that describe commonalities within a set of objects are automatically generated, modified, evaluated, and selected; we will also discuss how the search for the _best_ query is conducted by the MASSON system.
Specific problems such as the generation of constants in queries, how to cope with type violations and other constraints when creating object-oriented queries, and query evaluation are discussed in some detail.",
}

@InProceedings{sahami:learning-limited:96,
  author = "Mehran Sahami",
  title = "Learning Limited Dependence Bayesian Classifiers",
  pages = "335",
  crossref = "simoudis.ea:proceedings-second:96",
}

% Preprint reference: no journal or year is recorded for this entry.
@Article{salzberg:on-comparing:,
  author = "Steven Salzberg",
  title = "On Comparing Classifiers: {A} Critique of Current Research and Methods",
  URL = "http://www.cs.jhu.edu/salzberg/critique.ps",
  abstract = "Experimental machine learning research needs to scrutinize its approach to experimental design. If not done very carefully, comparative studies of classification algorithms can easily result in statistically invalid conclusions. This paper describes several phenomena that can, if ignored, invalidate an experimental comparison. It also divides machine learning research into several different types, and discusses why comparative analysis is more important for some than for others.",
  annote = "homepage with decision tree papers is at: http://www.cs.jhu.edu/salzberg/home.html",
}

@InCollection{sammut.ea:learning-concepts:86,
  author = "Claude Sammut and Ranan B. Banerji",
  title = "Learning concepts by asking questions",
  pages = "167--191",
  crossref = "michalski.ea:machine-learning:86",
}

@InProceedings{saraee.ea:temporal-initial:95,
  author = "Mohamed H. Saraee and Babis Theodoulidis",
  title = "Knowledge Discovery in Temporal Databases: The Initial Step",
  booktitle = "Knowledge Discovery Workshop of the International Conference on Deductive and Object Oriented Databases Workshop (DOOD)",
  address = "Singapore",
  year = "1995",
  month = dec,
}

@Article{sasisekharan.ea:forecasting-large-scale:96,
  author = "R. Sasisekharan and V. Seshadri and S. M.
Weiss",
  address = "At\&T Bell Labs, Tech Staff, Middletown, Nj, 07748 Rutgers State Univ, Dept Comp Sci, New Brunswick, Nj, 08903",
  title = "Data mining and forecasting in large-scale telecommunication networks",
  journal = "IEEE Expert-Intelligent Systems \& Their Applications",
  year = "1996",
  volume = "11",
  number = "1",
  pages = "37--43",
}

% Entry above: the issue is now stored in "number" (standard styles ignore
% an "issue" field) and the journal name spells IEEE in capitals.

@InProceedings{savnik.ea:bottom-up-induction:93,
  author = "I. Savnik and P. A. Flach",
  booktitle = "Proc. of AAAI-93 Workshop: Knowledge Discovery in Databases",
  title = "Bottom-up induction of functional dependencies from relations",
  year = "1993",
  URL = "ftp://martin.ijs.si/pub/CSD/Reports/CSD-TR-93-3.ps.gz",
  editor = "G. Piatetsky-Shapiro",
  keywords = "Functional dependency, Knowledge Discovery, Databases",
  month = jul,
  pages = "174--185",
}

@Article{scheines.ea:finding-latent:92,
  crossref = "ijis-special-issue:92",
  author = "Richard Scheines and Peter Spirtes",
  title = "Finding Latent Variable Models in Large Databases",
  pages = "609--621",
}

% The author list was garbled ("J. Armstrong Schmitz and Little. J. D. C.");
% it is now three names in "Last, First" form. The misspelt field "page"
% (silently ignored by BibTeX) is now "pages", and the camelCase product
% name is brace-protected against style recasing.
@Article{schmitz.ea:coverstory--automated:90,
  author = "Schmitz, J. D. and Armstrong, G. D. and Little, J. D. C.",
  title = "{CoverStory} --- Automated news finding in marketing",
  journal = "Decision Support Systems Transaction",
  year = "1990",
  pages = "46--54",
  keywords = "marketing, sales data, cranberry, ocean spray",
}

% A paper in conference proceedings, so typed as InProceedings.
@InProceedings{scholkopf.ea:extracting-support:95,
  author = "B. Sch{\"o}lkopf and C. Burges and V. Vapnik",
  title = "Extracting support data for a given task",
  booktitle = "Proceedings, First International Conference on Knowledge Discovery and Data Mining",
  publisher = "AAAI Press",
  address = "Menlo Park, CA",
  editor = "U. M. Fayyad and R. Uthurusamy",
  year = "1995",
}

% 62 is the journal volume, so it moves from "number" to "volume"; the
% trailing period in the title is dropped because styles add their own.
% NOTE(review): the first author's surname is probably "Segre" -- confirm
% against the journal before changing it.
@Article{serge.ea:book-review:93,
  author = "Alberto Serge and Geoffrey Gordon",
  title = "Book Review of Computer Systems That Learn",
  journal = "Artificial Intelligence",
  year = "1993",
  volume = "62",
  pages = "363--378",
  annote = "Review / overview of Computer Systems That Learn by Sholom M. Weiss and Casimir A. Kulikowski.
Identifies dimensions of classification of learning methods. Looks at Neural Nets, Statistical Methods and Machine Learning approaches (ID3, CART, C4). 17 References.",
}

@Article{sestito.ea:using-single:91,
  author = "Sabrina Sestito and Tharam Dillon",
  title = "Using single layered neural networks for the extraction of conjunctive rules and hierarchical classifications",
  journal = "Journal of Applied Intelligence",
  year = "1991",
  volume = "1",
  pages = "157--173",
}

% Whole-proceedings record; the annote lists the table of contents.
@Proceedings{shadbolt.ea:advances-acquisition:96,
  editor = "Nigel Shadbolt and Kieron O'Hara and Guus Schreiber",
  title = "Advances in knowledge acquisition: 9th European Knowledge Acquisition Workshop, {EKAW} '96, Nottingham, United Kingdom, May 14--17, 1996: proceedings",
  booktitle = "Advances in knowledge acquisition: 9th European Knowledge Acquisition Workshop, {EKAW} '96, Nottingham, United Kingdom, May 14--17, 1996: proceedings",
  series = "Lecture Notes in Artificial Intelligence and Lecture Notes in Computer Science",
  volume = "1076",
  publisher = "Springer-Verlag Inc.",
  address = "New York, NY, USA",
  year = "1996",
  pages = "xii + 369",
  ISBN = "3-540-61273-4 (softcover)",
  ISSN = "0302-9743",
  LCCN = "QA76.73.E95 E92 1996",
  annote = "Assumptions of problem-solving methods / Richard Benjamins and Christine Pierret-Golbreich -- Problem-solving methods: making assumptions for efficiency reasons / Dieter Fensel and Remco Straatman -- The thin end of the wedge: efficiency and the generalised directive model methodology / Kieron O'Hara and Nigel Shadbolt -- Principles for libraries of task decomposition methods: conclusions from a case-study / Klas Orsvarn -- A purpose driven method for language comparison / The REVISE Project -- A conceptual and formal model of a diagnostic reasoner / Richard Benjamins and Manfred Aben -- Ontology construction for technical domains / Jan Benjamin \ldots{} [et al.]
-- Text clustering to help knowledge acquisition from documents / Stephane Lapalut -- A quality-based terminological reasoning model for text knowledge acquisitions / Udo Hahn, Manfred Klenner and Klemens Schnattinger -- Extracting conceptual knowledge from text using explicit relation markers / Paul R. Bowden, Peter Halstead and Tony G. Rose -- Structuring information in a distributed hypermedia system / Celia Ghedini Ralha -- Diagrammatic knowledge acquisition: elicitation, analysis and issues / Peter C.-H. Cheng -- An approach to measuring theory quality / Edgar Sommer -- Some late-breaking news from the data mines and a preview of the KOALA system: a prospector's report / Franz Schmalhofer and Christoph Kozieja -- A knowledge acquisition tool for multi-perspective concept formation / Joao Jose Furtado Vasco, Colette Faucher and Eugene Chouraqui -- Knowledge discovery in databases: exploiting knowledge-level redescription / James Cupit and Nigel Shadbolt -- Towards painless knowledge acquisition / Derek Sleeman and Fraser Mitchell -- The acquisition of a shared task model / Frances Brazier, Jan Treur and Niek Wijngaards -- The group elicitation method: an introduction / Guy Boy -- Formalising the repair of schedules through knowledge acquisition / Janet Efstathiou -- Intelligent tools for planning knowledge base development and verification / Steve A. Chien -- Configuring service recovery planning with the CommonKADS library / V. Arlanzon, A. Bernaras and I. Laresgoiti -- Domain and system influences in problem solving models for planning / Hugh Cottam and Nigel Shadbolt.",
  keywords = "Knowledge acquisition (Expert systems) -- Congresses.",
}

% Entry above: the line-break artefact "late- breaking" in the contents
% list has been rejoined.

@Article{shan.ea:data-based-acquisition:95,
  author = "N. Shan and W.
Ziarko",
  address = "Univ Regina, Dept Comp Sci, Regina, Sk S4S 0A2, Canada",
  title = "Data-based acquisition and incremental modification of classification rules",
  journal = "Computational Intelligence",
  year = "1995",
  volume = "11",
  number = "2",
  pages = "357--370",
  abstract = "One of the most important problems in the application of knowledge discovery systems is the identification and subsequent updating of rules. Many applications require that the classification rules be derived from data representing exemplar occurrences of data patterns belonging to different classes. The problem of identifying such rules in data has been researched within the field of machine learning, and more recently in the context of rough set theory and knowledge discovery in databases. In this paper we present an incremental methodology for finding all maximally generalized rules and for adaptive modification of them when new data become available. The methodology is developed in the context of rough set theory and is based on the earlier idea of discernibility matrix introduced by Skowron.",
  keywords = "ROUGH SETS, DECISION RULES, KNOWLEDGE DISCOVERY, MACHINE LEARNING, INCREMENTAL LEARNING, ADAPTIVE SYSTEMS",
}

% Entry above: the issue is now stored in "number"; standard BibTeX styles
% ignore a field called "issue", so it was never printed.

@InProceedings{shan.ea:discovering-classification:96,
  title = "Discovering Classification Knowledge in Databases Using Rough Sets",
  pages = "271",
  author = "Ning Shan and Wojciech Ziarko and Howard J. Hamilton and Nick Cercone",
  crossref = "simoudis.ea:proceedings-second:96",
}

@Book{shannon.ea:mathematical-theory:49,
  author = "Claude E. Shannon and Warren Weaver",
  title = "The mathematical theory of communication",
  publisher = "University of Illinois Press",
  year = "1949",
  annote = "Book by the father of information theory.",
}

@Book{shapiro:encyclopedia-artificial:92,
  editor = "Stuart C. Shapiro",
  title = "Encyclopedia of artificial intelligence",
  publisher = "Wiley",
  year = "1992",
}

@TechReport{shavlik.ea:combining-explanation-based:89,
  author = "Jude W.
Shavlik and Geoffrey G. Towell",
  title = "Combining Explanation-based and Neural Learning: An Algorithm and Empirical Results",
  institution = "University of Wisconsin",
  number = "859",
  month = jun,
  year = "1989",
}

% Entry above: title typo "Emperical" corrected and "Algorithm" capitalised.

% "number" now holds only the report number; the label "Technical Report"
% is supplied by the style and the date goes in month/year.
@TechReport{shavlik.ea:symbolic-neural:90,
  author = "Jude W. Shavlik and Raymond J. Mooney and Geoffrey G. Towell",
  title = "Symbolic and Neural Learning Algorithms: An Experimental Comparison (Revised)",
  institution = "Department of Computer Sciences, University of Wisconsin",
  number = "955",
  month = aug,
  year = "1990",
  keywords = "Empirical Learning, Connectionism, Neural Networks, Inductive Learning, ID3, Perceptron, Backpropagation",
  annote = "To Appear in Machine Learning, Volume 6, 1991. Comparison of ID3, Backprop and Perceptron on 5 large, real-world data sets.",
}

@InProceedings{shek.ea:scalable-exploratory:96,
  title = "Scalable Exploratory Data Mining of Distributed Geoscientific Data",
  pages = "32",
  author = "Eddie C. Shek and Richard R. Muntz and Edmond Mesrobian and Kenneth Ng",
  crossref = "simoudis.ea:proceedings-second:96",
}

% Issue moved to "number"; journal name given in full with correct casing.
@Article{shen.ea:metapattern-based-automated:96,
  author = "W. M. Shen and B. Leng",
  address = "Univ So Calif, Inst Informat Sci, 4676 Admiralty Way, Marina Del Rey, Ca, 90292 Univ So Calif, Dept Comp Sci, Marina Del Rey, Ca, 90292 Inference Corp, Chicago, Il, 60631",
  title = "A metapattern-based automated discovery loop for integrated data mining - unsupervised learning of relational patterns",
  journal = "IEEE Transactions on Knowledge and Data Engineering",
  year = "1996",
  month = dec,
  volume = "8",
  number = "6",
  pages = "898--910",
  abstract = "Metapattern (also known as metaquery) is a new approach for integrated data mining systems.
Different from a typical ``tool-box'' like integration, where components must be picked and chosen by users without much help, metapatterns provide a common representation for intercomponent communication as well as a human interface for hypothesis development and search control. One weakness of this approach, however, is that the task of generating fruitful metapatterns is still a heavy burden for human users. In this paper, we describe a metapattern generator and an integrated discovery loop that can automatically generate metapatterns. Experiments in both artificial and real-world databases have shown that this new system goes beyond the existing machine learning technologies, and can discover relational patterns without requiring humans to prelabel the data as positive or negative examples for some given target concepts. With this technology, future data mining systems could discover high-quality, human comprehensible knowledge in a much more efficient and focused manner, and data mining could be managed easily by both expert and less expert users.",
  keywords = "induction, deduction, human interaction, integration, unsupervised learning, relational concepts, metaquery, metapattern",
}

% Entry above: the opening quote marks are now TeX-style (``...'') and the
% split word "high- quality" has been rejoined.

@InProceedings{shen.ea:metapattern-generation:96,
  title = "Metapattern Generation for Integrated Data Mining",
  pages = "152",
  author = "Wei-Min Shen and Bing Leng",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{shen:complementary-discrimination:92,
  author = "W. M. Shen",
  title = "Complementary Discrimination Learning with Decision Lists",
  year = "1992",
  booktitle = "Proc.\ of AAAI-92",
  pages = "153--158",
}

@Article{shen:discovering-regularities:92,
  crossref = "ijis-special-issue:92",
  author = "Wei-Min Shen",
  title = "Discovering Regularities from Knowledge Bases",
  pages = "623--635",
}

@Article{sheridan:survey-techniques:91,
  author = "F. K. J.
Sheridan",
  title = "A survey of techniques for inference under uncertainty",
  journal = "Artificial Intelligence Review",
  year = "1991",
  pages = "89--119",
  volume = "5",
}

% The required "year" was missing; 1992 and April are taken from the
% workshop date code in the annote (D920426-28) and the booktitle.
@InProceedings{shimakawa.ea:trend-recognition:,
  author = "Hiromitsu Shimakawa and Kouji Kikkawa",
  title = "Trend Recognition with Time Series Database",
  booktitle = "FUTURE DATABASES 92",
  year = "1992",
  month = apr,
  volume = "3",
  publisher = "WORLD SCIENTIFIC PUBL CO PTE LTD,SINGAPORE",
  chapter = "46",
  pages = "373--383",
  annote = "2ND FAR-EAST WORKSHOP ON FUTURE DATABASE SYSTEMS KYOTO,JAPAN D920426-28",
}

% "Kamberand" had lost the space before the " and " separator, so BibTeX
% parsed the two authors as one person.
@InProceedings{shinghal:evaluating-interestingness:96,
  title = "Evaluating the Interestingness of Characteristic Rules",
  pages = "263",
  author = "Micheline Kamber and Rajjan Shinghal",
  crossref = "simoudis.ea:proceedings-second:96",
}

% Issue moved to "number"; the acronym BT is restored and brace-protected
% in the title; the split word "high- value" is rejoined.
@Article{shortland.ea:applications-bt:94,
  author = "R. Shortland and R. Scarfe",
  address = "British Telecommun Labs, Martlesham Heath, Ipswich 1P5 7Re, Suffolk, England",
  title = "Data mining applications in {BT}",
  journal = "BT Technology Journal",
  year = "1994",
  volume = "12",
  number = "4",
  pages = "17--22",
  abstract = "With the increased use of computers there is an ever increasing volume of data being generated and stored. This can lead to companies becoming 'data rich and information poor'. This paper describes how BT has used data mining techniques to convert volume data into high-value information which can be used to aid decision making in a number of key business processes. The benefit of actively using data, as opposed to passively storing it, is demonstrated via a number of case studies which cover areas as diverse as fault diagnosis, fraud detection, market segmentation, credit vetting and litigation assessment.",
}

@InProceedings{siebes.ea:keso-minimizing:97,
  title = "{KESO}: Minimizing Database Interaction",
  author = "Arno Siebes and Martin L.
Kersten",
  pages = "247",
  crossref = "heckerman.ea:proceedings-third:97",
}

% The required TechReport fields were missing. Institution and report
% number are read off the URL (CWIreports/.../CS-R9430).
% NOTE(review): year 1994 is inferred from the "R94" prefix of the report
% number -- confirm against the report itself.
@TechReport{siebes:homogeneous-discoveries:,
  URL = "ftp://ftp.cwi.nl/pub/CWIreports/AA/CS-R9430.ps.Z",
  title = "Homogeneous Discoveries Contain no Surprises: Inferring Risk-profiles from Large Databases",
  author = "Arno Siebes",
  institution = "CWI",
  number = "CS-R9430",
  year = "1994",
  abstract = "Many models of reality are probabilistic. For example, not everyone orders crisps with their beer, but a certain percentage does. Inferring such probabilistic knowledge from databases is one of the major challenges for data mining.

Recently Agrawal et al. investigated a class of such problems. In this paper a new class of such problems is investigated, viz., inferring risk-profiles. The proto-typical example of this class is: ``what is the probability that a given policy-holder will file a claim with the insurance company in the next year''. A risk-profile is then a description of a group of insurants that have the same probability for filing a claim.

It is shown in this paper that homogeneous descriptions are the most plausible risk-profiles. Moreover, under modest assumptions it is shown that covers of such homogeneous descriptions are essentially unique. A direct consequence of this result is that it suffices to search for the homogeneous description with the highest associated probability.

The main result of this paper is thus that we show that the inference problem for risk-profiles reduces to the well studied problem of maximising a quality function.",
  annote = "CR subject classification (1991): Computer based methods in probability and statistics (G.3), Database applications (H.2.8), Information search and retrieval (H.3.3) clustering, search process, Learning (I.2.6) concept learning, induction, knowledge acquisition",
  keywords = "Data Mining, Probabilistic Knowledge, Probabilistic Search, Probability Theory",
}

% Issue moved to "number"; journal name given in full with correct casing.
@Article{silberschatz.ea:what-makes:96,
  author = "A. Silberschatz and A. Tuzhilin",
  address = "At\&T Bell Labs, Lucent Technol, 600 Mt Ave, Murray Hill, Nj, 07974 Nyu, Stern Sch Business, Dept Informat Syst, New York, Ny, 10012",
  title = "What makes patterns interesting in knowledge discovery systems",
  journal = "IEEE Transactions on Knowledge and Data Engineering",
  year = "1996",
  volume = "8",
  number = "6",
  pages = "970--974",
  abstract = "One of the central problems in the field of knowledge discovery is the development of good measures of interestingness of discovered patterns. Such measures of interestingness are divided into objective measures-those that depend only on the structure of a pattern and the underlying data used in the discovery process, and the subjective measures-those that also depend on the class of users who examine the pattern. The focus of this paper is on studying subjective measures of interestingness. These measures are classified into actionable and unexpected, and the relationship between them is examined. The unexpected measure of interestingness is defined in terms of the belief system that the user has. Interestingness of a pattern is expressed in terms of how it affects the belief system.
The paper also discusses how this unexpected measure of interestingness can be used in the discovery process.",
  keywords = "measures of interestingness, patterns, actionability, unexpectedness, belief systems",
}

% Crossref parent for the KDD-96 papers. Classic BibTeX copies missing
% fields verbatim and does not map title to booktitle, so "booktitle" is
% given here explicitly for the child InProceedings entries to inherit.
@Proceedings{simoudis.ea:proceedings-second:96,
  title = "Proceedings of the Second International Conference on Knowledge Discovery and Data Mining ({KDD}-96)",
  booktitle = "Proceedings of the Second International Conference on Knowledge Discovery and Data Mining ({KDD}-96)",
  year = "1996",
  editor = "Evangelos Simoudis and Jia Wei Han and Usama Fayyad",
  publisher = "AAAI Press",
}

% Issue moved to "number"; IEEE capitalised in the journal name.
@Article{simoudis:reality-check:96,
  author = "E. Simoudis",
  address = "Ibm Corp, Almaden Res Ctr, 650 Harry Rd, San Jose, Ca, 95120",
  title = "Reality check for data mining",
  journal = "IEEE Expert-Intelligent Systems \& Their Applications",
  year = "1996",
  volume = "11",
  number = "5",
  pages = "26--33",
}

% Issue moved to "number".
@Article{skowron:extracting-laws:95,
  author = "A. Skowron",
  address = "Warsaw Univ, Inst Math, Banacha 2, Pl-02097 Warsaw, Poland",
  title = "Extracting laws from decision tables - a rough set approach",
  journal = "Computational Intelligence",
  year = "1995",
  volume = "11",
  number = "2",
  pages = "371--388",
  abstract = "We present some methods, based on the rough set and Boolean reasoning approaches, for extracting laws from decision tables. First we discuss several procedures for decision rules synthesis from decision tables. Next we show how to apply some near-to-functional relations between data to data filtration.
Two methods of searching for new classifiers (features) are described: searching for new classifiers in a given set of logical formulas, and searching for some functions approximating near-to-functional relations.",
  keywords = "REASONING UNDER UNCERTAINTY, ROUGH SETS, KNOWLEDGE DISCOVERY, MACHINE LEARNING",
}

@InProceedings{smyth.ea:anytime-exploratory:97,
  title = "Anytime Exploratory Data Analysis for Massive Data Sets",
  author = "Padhraic Smyth and David Wolpert",
  pages = "54",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InProceedings{smyth.ea:detecting-atmospheric:97,
  title = "Detecting Atmospheric Regimes Using Cross-Validated Clustering",
  author = "Padhraic Smyth and Michael Ghil and Kayo Ide and Joe Roden and Andrew Fraser",
  pages = "61",
  crossref = "heckerman.ea:proceedings-third:97",
}

% Issue moved to "number" (standard styles ignore "issue"); journal name
% given in full with correct casing.
@Article{smyth.ea:information-theoretic:92,
  author = "P. Smyth and R. M. Goodman",
  address = "Caltech, Jet Propuls Lab 238420, Commun Syst Res Sect, Pasadena, Ca, 91109 Caltech, Dept Elect Engn, Pasadena, Ca, 91125",
  title = "An information theoretic approach to rule induction from databases",
  journal = "IEEE Transactions on Knowledge and Data Engineering",
  year = "1992",
  volume = "4",
  number = "4",
  pages = "301--316",
  abstract = "The knowledge acquisition bottleneck in obtaining rules directly from an expert is well known. Hence, the problem of automated rule acquisition from data is a well-motivated one, particularly for domains where a database of sample data exists. In this paper we introduce a novel algorithm for the induction of rules from examples. The algorithm is novel in the sense that it not only learns rules for a given concept (classification), but it simultaneously learns rules relating multiple concepts. This type of learning, known as generalized rule induction is considerably more general than existing algorithms which tend to be classification oriented. Initially we focus on the problem of determining a quantitative, well-defined rule preference measure.
In particular, we propose a quantity called the J-measure as an information theoretic alternative to existing approaches. The J-measure quantifies the information content of a rule or a hypothesis. We will outline the information theoretic origins of this measure and examine its plausibility as a hypothesis preference measure. We then define the ITRULE algorithm which uses the newly proposed measure to learn a set of optimal rules from a set of data samples, and we conclude the paper with an analysis of experimental results on real-world data.",
  keywords = "PRINCIPLE, INFERENCE, EXPERT, CROSS ENTROPY, EXPERT SYSTEMS, INFORMATION THEORY, MACHINE LEARNING, KNOWLEDGE ACQUISITION, KNOWLEDGE DISCOVERY, RULE-BASED SYSTEMS, RULE INDUCTION",
  month = aug,
  annote = "Describes the ITRULE system which uses the J-measure of average information content of a rule. Also develops a bound on the J-measure which is used to stop specialisation of a rule when no further improvement can be made. Info on the application of ITRULE to computer analysis of Bach can be found at http://www.gold.net/online/archive/940929_Bach.html",
}

% KDD-96 paper; venue details come from the crossref parent.
@InProceedings{smyth:clustering-using:96,
  author = "Padhraic Smyth",
  title = "Clustering Using Monte Carlo Cross-Validation",
  pages = "126",
  crossref = "simoudis.ea:proceedings-second:96",
}

% KDD-97 paper; venue details come from the crossref parent.
@InProceedings{soderland:learning-to:97,
  author = "Stephen Soderland",
  title = "Learning to Extract Text-Based Information from the World Wide Web",
  pages = "251",
  crossref = "heckerman.ea:proceedings-third:97",
}

@Book{soucek:neural-intelligent:91,
  author = "Branko Soucek",
  title = "Neural and Intelligent Systems Integration: Fifth and Sixth Generation Integrated Reasoning Information Systems",
  series = "Sixth Generation Computer Technologies Series",
  publisher = "Wiley-Interscience",
  year = "1991",
  pages = "688",
  keywords = "book, text,",
  abstract = "** Description ** Combines new techniques of software automation, system adaptation, module selection, self-organization and
automated discovery. Presents results from the IRIS Group--findings from American, European, Korean and Japanese projects on this emerging discipline. Explores methods of combining well-defined intelligent modules for integration into intelligent systems. Modules include intelligent algorithms and programs, neural networks and computing elements, fuzzy data comparators and correlators, sparse distributed memories, expert systems, intelligent databases, associative and parallel processing units, and data acquisition, control and robot units.\par ** Partial Contents ** NEURAL, GENETIC, AND INTELLIGENT ALGORITHMS AND COMPUTING ELEMENTS. From Modules to Application-Oriented Integrated Systems (B. Soucek). Neural Network Models of Concept Learning (P. Schyns). Teaching Network Connections for Real-Time Object Recognition (S. Wilson). Neural Networks on Parallel Computers (H. Yoon, et al.). Neural Bit-Slice Computing Element (J. Yestrebsky, et al.). INTEGRATED NEURAL-KNOWLEDGE-FUZZY HYBRIDS. Fuzzy Data Comparator with Neural Network Postprocessor: A Hardware Implementation (P. Basehore, et al.). Injecting Symbol Processing Into a Connectionist Model (S. Romaniuk \& L. Hall). INTEGRATED REASONING, INFORMING, AND SERVING SYSTEMS. An Advanced Software Paradigm for Intelligent Systems Integration (T. Ichiko). Intelligent Data Base and Automatic Discovery (K. Parsaye, et al.). Index.\par ** Market ** Practicing Engineers and Scientists, Students, Researchers.",
  ISBN = "0-471-53676-8",
  note = "I-0-471-53676-8 1991cloth \$89.95",
}

% Entry above: the ISBN that was only embedded in the note is also given
% in its own ISBN field (the note is left unchanged), and "spare
% distributed memories" in the abstract is corrected to "sparse".

@InProceedings{spears.ea:using-genetic:90,
  author = "William M.
Spears and Kenneth {De Jong}",
  title = "Using genetic algorithms for supervised concept learning",
  booktitle = "Proceedings of tools for AI",
  organization = "IEEE",
  year = "1990",
}

% Entry above: the field was spelt "organisation", which BibTeX does not
% recognise and silently drops; it is now "organization". The two-word
% surname "De Jong" is braced so it is not split into first and last name.

@InProceedings{srikant.ea:association-rules:97,
  title = "Mining Association Rules with Item Constraints",
  author = "Ramakrishnan Srikant and Quoc Vu and Rakesh Agrawal",
  pages = "67",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InProceedings{srikant.ea:quantitative-association:96,
  title = "Mining Quantitative Association Rules in Large Relational Tables",
  author = "Ramakrishnan Srikant and Rakesh Agrawal",
  editor = "H. V. Jagadish and Inderpal Singh Mumick",
  booktitle = "Proceedings of the 1996 {ACM} {SIGMOD} International Conference on Management of Data",
  address = "Montreal, Quebec, Canada",
  month = "4--6~" # jun,
  year = "1996",
  pages = "1--12",
}

@InProceedings{stolfo.ea:jam-java:97,
  title = "{JAM}: Java Agents for Meta-Learning over Distributed Databases",
  author = "Salvatore Stolfo and Andreas L. Prodromidis and Shelley Tselepis and Wenke Lee and Dave W. Fan and Philip K. Chan",
  pages = "74",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InProceedings{stolorz.ea:harnessing-graphical:96,
  title = "Harnessing Graphical Structure in Markov Chain Monte Carlo Learning",
  pages = "134",
  author = "Paul E. Stolorz and Philip C. Chew",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{stolorz.ea:quakefinder-scalable:96,
  title = "Quakefinder: {A} Scalable Data Mining System for Detecting Earthquakes from Space",
  pages = "208",
  author = "Paul Stolorz and Christopher Dean",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{stolorz:fast-spatio-temporal-data-mining-of-large-geophysical-datasets:95,
  author = "P.
Stolorz",
  title = "Fast Spatio-Temporal Data Mining of Large Geophysical Datasets",
  booktitle = "Proceedings of the First International Conference on Knowledge Discovery and Data Mining (KDD)",
  year = "1995",
  address = "Montreal, Canada",
  month = aug,
  publisher = "AAAI Press",
  editor = "U. M. Fayyad and R. Uthurusamy",
}

% Entry above: the braces around the whole title were removed; they
% stopped bibliography styles from applying their own title casing.

@InProceedings{stough.ea:image-feature:97,
  title = "Image Feature Reduction through Spoiling: Its Application to Multiple Matched Filters for Focus of Attention",
  author = "Timothy M. Stough and Carla E. Brodley",
  pages = "255",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InProceedings{subramonian.ea:visual-interactive:97,
  title = "A Visual Interactive Framework for Attribute Discretization",
  author = "Ramesh Subramonian and Ramana Venkata and Joyce Chen",
  pages = "82",
  crossref = "heckerman.ea:proceedings-third:97",
}

@InProceedings{suzuki.ea:exceptional-based:96,
  title = "Exceptional Knowledge Discovery in Databases Based on Information Theory",
  pages = "275",
  author = "Einoshin Suzuki and Masamichi Shimura",
  crossref = "simoudis.ea:proceedings-second:96",
}

@InProceedings{suzuki:autonomous-reliable:97,
  title = "Autonomous Discovery of Reliable Exception Rules",
  author = "Einoshin Suzuki",
  pages = "259",
  crossref = "heckerman.ea:proceedings-third:97",
}

@Misc{swami:with-silicon:,
  title = "Data Mining with Silicon Graphics Technology",
  author = "Arun Swami",
  URL = "http://www-europe.sgi.com/Technology/data-mining.html",
  howpublished = "www publication",
}

@InProceedings{swanson.ea:undiscovered-public:96,
  title = "Undiscovered Public Knowledge: {A} Ten-Year Update",
  pages = "295",
  author = "Don R. Swanson and Neil R. Smalheiser",
  crossref = "simoudis.ea:proceedings-second:96",
}

@Article{szolovits:uncertainty-decisions:95,
  author = "P.
Szolovits",
  address = "Mit, Comp Sci Lab, 545 Technol Sq, Cambridge, Ma, 02139",
  title = "Uncertainty and decisions in medical informatics",
  journal = "Methods of Information in Medicine",
  year = "1995",
  volume = "34",
  number = "1--2",
  pages = "111--121",
  abstract = "This paper presents a tutorial introduction to the handling of uncertainty and decision-making in medical reasoning systems. It focuses on the central role of uncertainty in all of medicine and identifies the major themes that arise in research papers. It then reviews simple Bayesian formulations of the problem and pursues the generalization to the Bayesian network methods that are popular today. Decision making is presented from the decision analysis viewpoint, with brief mention of recently-developed methods. The paper concludes with review of more abstract characterization of uncertainty, and anticipates the growing importance of analytic and ``data mining'' techniques as growing amounts of clinical data become widely available.",
  keywords = "COMPUTER, NETWORKS, MODEL, DECISION SUPPORT, UNCERTAINTY, BAYES, GRAPH MODELS, DECISION TREES, INFLUENCE DIAGRAMS",
}

% Entry above: the double issue is now in "number" with an en-dash range,
% the journal name uses normal title casing, and the opening quote marks
% in the abstract are TeX-style.

% Issue moved to "number"; journal name given in full with BT capitalised.
@Article{tattersall.ea:techniques:94,
  author = "G. D. Tattersall and P. R. Limb",
  address = "British Telecommun Labs, Ipswich 1P5 7Re, Suffolk, England",
  title = "Visualization techniques for data mining",
  journal = "BT Technology Journal",
  year = "1994",
  volume = "12",
  number = "4",
  pages = "23--31",
  abstract = "BT collects and stores large quantities of data from a variety of sources. These large data sets typically describe different states of a system and are difficult to interpret because there is no obvious way of abstracting and presenting data features in a meaningful way for a human observer. Data mining is a term which has recently become popular to describe techniques for the exploration and exploitation of data.
In particular, a large part of data mining involves the visualisation of data and subsequent utilisation of machine-learning techniques for classification of data. This paper describes some techniques for data visualisation which enable the user to enhance understanding of the structure and properties of (often multidimensional) data prior to applying machine-learning techniques for further analysis and exploration.",
}

% Entry above: the split word "machine- learning" has been rejoined.

@Article{teller.ea:program-evolution:95,
  author = "Astro Teller and Manuela Veloso",
  title = "Program Evolution for Data Mining",
  editor = "Sushil Louis",
  publisher = "JAI Press",
  journal = "The International Journal of Expert Systems",
  year = "1995",
  volume = "8",
  number = "3",
  pages = "216--236",
  keywords = "genetic algorithms, genetic programming, memory",
  URL = "http://www.cs.cmu.edu/afs/cs/usr/astro/public/papers/Astro-ESJ.ps",
  url_2 = "ftp://cs.ucl.ac.uk/genetic/papers/Astro-ESJ.ps.Z",
  abstract = "Around the world there are innumerable databases of information. The quantity of information available has created a high demand for automatic methods for searching these databases and extracting specific kinds of information. Unfortunately, the information in these databases increasingly contains signals that have no corresponding classification symbols. Examples include databases of images, sounds, etc. A few systems have been written to help solve these search and retrieve issues. But we can not write a new system for every kind of signal we want to recognize and extract. Some work has been done on automating (i.e. learning) the task of identifying desired signal elements. It would be useful to automate (learn) not just a part of the classification function, but the entire signal identification program. It would be helpful if we could use the same learning architecture to automatically create these programs for distinguishing many different classes of the same signal type.
It would be better still if we could use the same learning architecture to create these programs even for signal types as different as images and sound waves. We introduce PADO (Parallel Architecture Discovery and Orchestration), a learning architecture designed to deliver this. PADO has at its core a variant of genetic programming (GP) that extends the paradigm to explore the space of algorithms. PADO learns the entire classification algorithm for an arbitrary signal type with arbitrary signal class distinctions. This architecture has been designed specifically for signal understanding and classification. The architecture of PADO and its achievements on the recovery of visual and acoustic signal classes from test databases are the subjects of this article.", note = "Third Quarter. Special Issue on Genetic Algorithms and Knowledge Bases.", } @InProceedings{terano.ea:interactive-marketing:96, title = "Interactive Knowledge Discovery from Marketing Questionnaire Using Simulated Breeding and Inductive Learning Methods", pages = "279", author = "Takao Terano and Yoko Ishino", crossref = "simoudis.ea:proceedings-second:96", } @InProceedings{thearling:its-all:97, author = "Kurt Thearling", title = "Data Mining \& Visualization: It's All In the Interaction", booktitle = "Integration of Data Mining and Data Visualization workshop , held in conjunction with both the KDD'97 and Visualization '97", year = "1997", URL = "http://www.santafe.edu/~kurt/dmviz.shtml", note = "Position Paper", annote = "What is the point of visualization? It's pretty simple: to let the user understand what is going on. Since data mining usually involves extracting ``hidden'' information from a database, the understanding process can get a bit complicated. 
The key is to put the user in a context they feel comfortable in and then let them poke and prod until they understand what they didn't see before.", } @InProceedings{thomas.ea:efficient-algorithm:97, title = "An Efficient Algorithm for the Incremental Updation of Association Rules in Large Databases", author = "Shiby Thomas and Sreenath Bodagala and Khaled Alsabti and Sanjay Ranka", pages = "263", crossref = "heckerman.ea:proceedings-third:97", } @TechReport{thrun.ea:monks-problems:91, author = "S. B. Thrun and others", title = "The {MONK}'s Problems, {A} Performance Comparison of Different Learning Algorithms", institution = "Carnegie Mellon University", number = "CMU-CS-91-197", month = dec, year = "1991", annote = "Over 100 pages, this report is really 9 short reports each evaluating the performance of a set of learning algorithms on standard test data. Contains descriptions of all the algorithms used. Packed full of references. [from the abstract] This report summarizes a comparison of different learning techniques which was performed at the 2nd European Summer School on Machine Learning, held in Belgium during summer 1991. A variety of symbolic and non-symbolic learning techniques - namely AQ17-DCL, AQ17-HCI, AQ17-FCLS, AQ14-NT, AQ15-GA, Assistant Professional, mFOIL, ID5R, IDL, ID5R-hat, TDIDT, ID3, AQR, CN2, CLASSWEB, ECOBWEB, PRISM, Backpropagation, and Cascade Correlation - are compared on three classification problems, the MONK's problems.", } @Article{towell.ea:extraction-refined:93, author = "Geoffrey G. Towell and Jude W. Shavlik", title = "The Extraction of Refined Rules from Knowledge-Based Neural Networks", journal = "Machine Learning", year = "1993", volume = "13", number = "1", pages = "71--101", } @Article{tsujino.ea:acquisition-driven:, author = "Katsuhiko Tsujino and Vlad G.
Dabija and Shogo Nishida", title = "Knowledge Acquisition Driven by Constructive and Interactive Induction", keywords = "Kaiser, meta-knowledge", } @Article{tsumoto.ea:application-rough:96, author = "S. Tsumoto and W. Ziarko", title = "The application of rough sets-based data mining technique to differential diagnosis of meningoencephalitis", journal = "Lecture Notes in Computer Science", volume = "1079", pages = "438--??", year = "1996", ISSN = "0302-9743", } @InProceedings{tsumoto.ea:automated-medical:96, title = "Automated Discovery of Medical Expert System Rules from Clinical Databases Based on Rough Sets", pages = "63", author = "Shusaku Tsumoto and Hiroshi Tanaka", crossref = "simoudis.ea:proceedings-second:96", } @InProceedings{turmon.ea:bayesian-inference:97, title = "Bayesian Inference for Identifying Solar Active Regions", author = "Michael Turmon and Saleem Mukhtar and Judit Pap", pages = "267", crossref = "heckerman.ea:proceedings-third:97", } @Article{ullman.ea:deductive-achievements:90, author = "Jeffrey D. Ullman and Carlo Zaniolo", title = "Deductive databases: achievements and future directions", journal = "SIGMOD Record (ACM Special Interest Group on Management of Data)", volume = "19", number = "4", pages = "75--82", month = dec, year = "1990", ISSN = "0163-5808", abstract = "The key concepts behind deductive databases and their newly developed enabling technology are reviewed. The declarative programming approach used for such databases is examined at length.
Current research on extending the functionality and usability of deductive databases and on providing a synthesis of deductive databases with procedural and object-oriented approaches are described.", affiliation = "Stanford Univ", affiliationaddress = "Stanford, CA, USA", classification = "723; C6160Z (Other DBMS); C6170 (Expert systems)", keywords = "Database Systems; Reviews; Computer Programming; Deductive Databases; Declarative Programming; Object-Oriented Programming; Procedural Programming, Procedural databases; Declarative queries; Deductive databases; Rule-based style; Knowledge mining; Computer-aided design; Enabling technology; Object-oriented approaches", thesaurus = "Deductive databases", } @InProceedings{ullman:efficient-implementation:96, title = "Efficient Implementation of Data Cubes Via Materialized Views", pages = "386", author = "Jeffrey D. Ullman", crossref = "simoudis.ea:proceedings-second:96", } @Misc{university-of-warsaw.ea:papers-on:, URL = "ftp://ftp.ii.pw.edu.pl/pub/Reports", title = "Papers on Rough Sets Theory and Information Systems", author = "{A team working in University of Warsaw and Warsaw University of Technology}", annote = "The main field of our work is Rough Sets Theory and Information Systems. Roughly speaking, our work is to find dependencies in experimental datasets to simulate decision processes with high quality.", } @InProceedings{urpani.ea:ritio-rule:96, title = "{RITIO} - Rule Induction Two In One", pages = "339", author = "David Urpani and Xindong Wu and Jim Sykes", crossref = "simoudis.ea:proceedings-second:96", } @Article{utgoff:incremental-induction:89, author = "P. E. Utgoff", title = "Incremental Induction of Decision Trees", journal = "Machine Learning", year = "1989", volume = "4", pages = "161--186", keywords = "ID3, ID5R", } @InCollection{utgoff:shift-bias:86, author = "Paul E.
Utgoff", title = "Shift of Bias for Inductive Concept Learning", crossref = "michalski.ea:machine-learning:86", pages = "107--148", } @Article{valiant:theory-learnable:84, author = "Leslie G. Valiant", title = "A Theory of the Learnable", journal = "Communications of the ACM", volume = "27", number = "11", pages = "1134--1142", month = nov, year = "1984", ISSN = "0001-0782", abstract = "Humans appear to be able to learn new concepts without needing to be programmed explicitly in any conventional sense. In this paper we regard learning as the phenomenon of knowledge acquisition in the absence of specific programming. We give a precise methodology for studying this phenomenon from a computational viewpoint. It consists of choosing an appropriate information gathering mechanism, the learning protocol, and exploring the class of concepts that can be learned using it in a reasonable (polynomial) number of steps. Although inherent algorithmic complexity appears to set serious limits to the range of concepts that can be learned, we show that there are some important nontrivial classes of propositional concepts that can be learned in a realistic sense.", comment = "Defines `learnability' wrt EXAMPLES and ORACLE using arbitrary probability measure on event space. Shows k-CNF learnable from examples only.", } @Article{vaughn:interpretation-multilayer:96, author = "M. L. Vaughn", address = "Cranfield Univ, Comp Informat Syst Management Grp, Rmcs, Swindon Sn6 8La, Wilts, England", title = "Interpretation and knowledge discovery from the multilayer perceptron network - opening the black-box", journal = "Neural Computing \& Applications", year = "1996", volume = "4", number = "2", pages = "72--82", abstract = "This paper interprets the outputs from the multilayer perceptron (MLP) network by finding the input data features at the input layer of the network which activate the hidden layer feature detectors.
This leads directly to the deduction of the significant data inputs, the inputs that the network actually uses to perform the input/output mapping for a classification task, and the discovery of the most significant of these data inputs. The analysis presents a method for providing explanations for the network outputs and for representing the knowledge learned by the network in the form of significant input data relationships. During network development the explanation facilities and data relationships can be used for network validation and verification, and after development, for rule induction and data mining where this method provides a potential tool for knowledge discovery in databases (KDD).", keywords = "DATA MINING, EXPLANATION FACILITIES, INTERPRETATION, KNOWLEDGE DISCOVERY, RULE INDUCTION, VALIDATION AND VERIFICATION", } @Article{vickery:introductory-review:97, author = "B. Vickery", address = "Univ Coll London, Mortimer St, London Wc1E 6Bt, England", title = "Knowledge discovery from databases: an introductory review", journal = "Journal of Documentation", year = "1997", volume = "53", number = "2", pages = "107--122", abstract = "The paper aims to provide a non-technical introduction to the new procedures being used to extract knowledge from databases. The reasons for developing knowledge discovery methods are discussed - primarily, the current production of very large databases that may include many data relations not explicit in the database structure. The background in machine learning is indicated. The methods used are described for such techniques as classification (sorting data into predefined classes), clustering (developing ab initio a data classification) and the detection of deviations from pre-established norms. Examples of the applications of these methods are given.
The paper concludes with some brief thoughts about the potential use of knowledge discovery in the information field.", keywords = "KDD, introduction, review", } @Proceedings{vijayaraman.ea:proceedings-22nd:96, editor = "T. M. Vijayaraman and A. Buchmann and C. Mohan and N. L. Sarda", title = "Proceedings of the 22nd International Conference on Very Large Data Bases", address = "San Francisco", year = "1996", ISBN = "1-55860-382-4", descriptor = "Data Mining, Anfragenbearbeitung, Raeumlicher Zugriffspfad, Datenbank, VLDB", } @Article{walker:how-feasible:87, author = "Michael G. Walker", title = "How feasible is automated discovery?", journal = "IEEE Expert", year = "1987", pages = "69--82", volume = "2", number = "1", note = "Spring 1987", annote = "Looks at Meta-dendral, RX (radix), Bacon, Prospector, AM. Compares them on Domain Knowledge, Search method, Search Representation, Data Driven vs Model driven discovery, Hypothesis Testing, Signal to Noise Ratio", } @InProceedings{wang.ea:automated-active:96, title = "Automated Discovery of Active Motifs in Multiple {RNA} Secondary Structures", pages = "70", author = "Jason T. L. Wang and Bruce A. Shapiro and Dennis Shasha and Kaizhong Zhang and Chia-Yo Chang", crossref = "simoudis.ea:proceedings-second:96", } @InProceedings{wang.ea:automated-active:97, title = "Automated Discovery of Active Motifs in Three Dimensional Molecules", author = "Xiong Wang and Jason T. L. Wang and Dennis Shasha and Bruce Shapiro and Sitaram Dikshitulu and Isidore Rigoutsos and Kaizhong Zhang", pages = "89", crossref = "heckerman.ea:proceedings-third:97", } @Article{wang.ea:combinatorial-pattern:94, author = "J. Tsong-Li Wang and Gung-Wei Chirn and T. G. Marr and B. Shapiro and D. Shasha and K.
Zhang", title = "Combinatorial Pattern Discovery for Scientific Data: Some Preliminary Results", journal = "SIGMOD Record (ACM Special Interest Group on Management of Data)", volume = "23", number = "2", pages = "115--125", month = jun, year = "1994", ISSN = "0163-5808", affiliation = "Dept. of Comput. and Inf. Sci., New Jersey Inst. of Technol., Newark, NJ, USA", classification = "C7330 (Biology and medicine); C1250 (Pattern recognition); C1180 (Optimisation techniques); C1160 (Combinatorial mathematics)", keywords = "Combinatorial pattern discovery; Scientific data; Natural entities; Distance metric; Protein databases; String edit distance; Common externally observable properties; Structural description; Variable-length don't cares; String matching algorithms; Discovery heuristics; Optimization heuristics; Protein classification; Data mining", thesaurus = "Biology computing; Combinatorial mathematics; Natural sciences computing; Optimisation; Pattern recognition; Proteins", xxcrossref = "Anonymous:1994:ASI", } @Article{wang.ea:discovering-active:94, author = "J. T. L. Wang and T. G. Marr and D. Shasha and B. A. Shapiro and G.-W. Chirn", journal = "Nucleic Acids Research", title = "Discovering Active Motifs in Sets of Related Protein Sequences and Using Them for Classification", year = "1994", abstract-url = "http://hertz.njit.edu/~jason/nar94.html", URL = "http://hertz.njit.edu/~jason/nar94.html", keywords = "Data mining, combinatorial pattern discovery, proteins, biochemisty", month = aug, number = "14", pages = "2769--2775", volume = "22", } @InProceedings{wang.ea:representing-discovered:96, title = "Representing Discovered Patterns Using Attributed Hypergraph", pages = "283", author = "Yang Wang and Andrew K. C. 
Wong", crossref = "simoudis.ea:proceedings-second:96", } @InProceedings{wang.ea:schema-semistructured:97, title = "Schema Discovery for Semistructured Data", author = "Ke Wang and Huiqing Liu", pages = "271", crossref = "heckerman.ea:proceedings-third:97", } @InProceedings{wang.ea:selecting-features:97, title = "Selecting Features by Vertical Compactness of Data", author = "Ke Wang and Suman Sundaresh", pages = "275", crossref = "heckerman.ea:proceedings-third:97", } @Article{weiss.ea:optimized-rule:93, author = "Sholom M. Weiss and Nitin Indurkhya", title = "Optimized Rule Induction", journal = "IEEE Expert", year = "1993", volume = "8", number = "6", pages = "61--69", month = dec, keywords = "Swap-1, decision rules, comparison.", annote = "Discusses the Swap-1 algorithm for learning decision rules. It is tested on 4 real world datasets - Nettalk, Heart, DNA and Rheum. Comparison with published info on NN, Linear Discriminants and decision trees applied to same problems. Possible extension through using a GA? 33 references.", } @Article{weiss.ea:rule-based-machine:95, author = "S. M. Weiss and N. Indurkhya", year = "1995", title = "Rule-based Machine Learning Methods for Functional Prediction", journal = "Journal of Artificial Intelligence Research", volume = "3", pages = "383--403", abstract = "We describe a machine learning method for predicting the value of a real-valued function, given the values of multiple input variables. The method induces solutions from samples in the form of ordered disjunctive normal form (DNF) decision rules. A central objective of the method and representation is the induction of compact, easily interpretable solutions. This rule-based decision model can be extended to search efficiently for similar cases prior to approximating function values.
Experimental results on real-world data demonstrate that the new techniques are competitive with existing machine learning and statistical methods and can sometimes yield superior regression performance.", URL = "http://www.cs.washington.edu/research/jair/table-of-contents-vol3.html", } @InProceedings{wiederhold.ea:acquisition:86, author = "Gio C. M. Wiederhold and Michael G. Walker and Robert L. Blum and Stephen M. Downs", title = "Acquisition of Knowledge from Data", booktitle = "{ACM SIGART} International Symposium on Methodologies for Intelligent Systems", pages = "74--84", address = "Knoxville, Tennessee", year = "1986", } @InProceedings{wiese:bi-directional-ilp:96, author = "M. Wiese", title = "A Bi-directional {ILP} Algorithm", booktitle = "Proceedings of the MLnet Familiarization Workshop on Data Mining with Inductive Logic Programming", pages = "61--72", year = "1996", } @InProceedings{wirth.ea:detecting-early:96, title = "Detecting Early Indicator Cars in an Automotive Database: {A} Multi-Strategy Approach", pages = "76", author = "Ruediger Wirth and Thomas P. Reinartz", crossref = "simoudis.ea:proceedings-second:96", } @Article{wolff:computing-as:95, author = "J. G. Wolff", address = "Univ Coll N Wales, Sch Electr Engn \& Comp Syst, Dean St, Bangor Ll57 1Ut, Gwynedd, Wales", title = "Computing as compression - an overview of the {SP} theory and system", journal = "New Generation Computing", year = "1995", volume = "13", number = "2", pages = "187--214", abstract = "This article is an overview of a programme of research based on the conjecture that all kinds of computing and formal reasoning may usefully be understood as information compression by pattern matching, unification and metrics-guided search. The research aims to develop this idea into a theory of computing to integrate and simplify diverse concepts in the field.
The research also aims to develop a 'new generation' computing system, based on the theory, to integrate and simplify diverse kinds of computing and to achieve more flexibility and 'intelligence' than conventional computers. Software simulations of the proposed new system provide a concrete expression of the developing theory and a test-bed for the ideas. The background to the research is briefly reviewed including evidence that information compression is a significant element in biological information processing systems. Concepts of information and redundancy are described as a basis for describing how information compression may be achieved by the comparison or matching of patterns, the merging or unification of patterns which are the same, together with metrics-guided search (e.g., 'hill climbing', 'beam search') to maximise compression for a given computational effort. The main elements of the SP theory and of the proposed SP system are described with a summary of developments to date. Some of the kinds of computing which be interpreted as information compression are briefly reviewed. These include: the 'low level' workings of conventional computers; information retrieval, pattern recognition and de-referencing of identifiers; unsupervised inductive learning (grammatical inference, data mining, automatic organisation of software and of knowledge bases); the execution of mathematical or computing functions; deductive and probabilistic inference; parsing and natural language processing; planning and problem solving. 
Areas of uncertainty where further work is needed are indicated at appropriate points throughout the article.", keywords = "KOLMOGOROV, COMPLEXITY, MODEL, INFORMATION COMPRESSION, THEORY OF COMPUTING, LEARNING, INFORMATION RETRIEVAL, PATTERN RECOGNITION, DEDUCTION, ABDUCTION", } @InProceedings{wrobel.ea:extensibility-systems:96, author = "Stefan Wrobel and Dietrich Wettschereck and Edgar Sommer and Werner Emde", title = "Extensibility in data mining systems", booktitle = "Proc. 2nd International Conference On Knowledge Discovery and Data Mining", editor = "Evangelos Simoudis and Jia Wei Han and Usama Fayyad", publisher = "AAAI Press", address = "Menlo Park, CA, USA", month = aug, year = "1996", pages = "214--219", URL = "ftp://ftp.gmd.de/ml-archive/GMD/papers/ML75.ps.gz", } @InProceedings{wrobel.ea:ilp-description:95, author = "Stefan Wrobel and Saso Dzeroski", title = "The {ILP} description learning problem: Towards a general model-level definition of data mining in {ILP}", booktitle = "Proc. Fachgruppentreffen Maschinelles Lernen (FGML-95)", editor = "K. Morik and J. Herrmann", publisher = "Univ. Dortmund", address = "44221 Dortmund", note = "Research Report 580", URL = "ftp://ftp.gmd.de/ml-archive/GMD/papers/ML68.ps.gz", year = "1995", } @InProceedings{wrobel.ea:user-interactivity:96, author = "Stefan Wrobel and Dietrich Wettschereck and A. Inkeri Verkamo and Arno Siebes and Heikki Mannila and Fred Kwakkel and Willi Kl{\"o}sgen", title = "User Interactivity in Very Large Scale Data Mining", booktitle = "Proc. FGML-96 (Annual Meeting of the GI Special Interest Group Machine Learning)", editor = "W. Dilger and M. Schlosser and J. Zeidler and A. Ittner", month = aug, year = "1996", pages = "125--130", publisher = "TU Chemnitz-Zwickau", address = "09111 Chemnitz", note = "Computer Science Technical Report No. 
CSR-96-06.", URL = "ftp://ftp.gmd.de/ml-archive/GMD/papers/ML74.ps.gz", } @InProceedings{wrobel:extensibility-systems:96, title = "Extensibility in Data Mining Systems", pages = "214--219", author = "Stefan Wrobel and Dietrich Wettschereck and Edgar Sommer and Werner Emde", crossref = "simoudis.ea:proceedings-second:96", } @Article{wu.ea:graphical-user-interface:96, author = "X. Wu and N. Cercone", address = "Hiroshima Univ, Fac Engn, Dept Elect Engn, Ai Architecture Lab, 1-4-1 Kagamiyama, Higashihiroshima 739, Japan Univ Regina, Regina, Sk S4S 0A2, Canada", title = "A graphical user-interface for knowledge discovery in databases", journal = "Engineering Applications of Artificial Intelligence", year = "1996", volume = "9", number = "6", pages = "691--700", abstract = "This paper describes a graphical user-interface for database-oriented knowledge discovery systems, DBLEARN, which has been developed for extracting knowledge rules from relational databases. The interface, designed using a query-by-example approach, provides a graphical means of specifying knowledge-discovery tasks. The interface supplies a graphical browsing facility to help users to perceive the nature of the target database structure. In order to guide users' task specification, a cooperative, menu-based guidance facility has been integrated into the interface. The interface also supplies a graphical interactive adjusting facility for helping users to refine the task specification to improve the quality of learned knowledge rules. Copyright (C) 1996 Elsevier Science Ltd", keywords = "LANGUAGE, EXAMPLE, graphical user-interfaces, knowledge discovery systems, database mining, database query processing, AI applications, visualisation", } @InCollection{wu.ea:integration-heuristic:91, crossref = "piatetsky-shapiro.ea:knowledge-discovery:91", editor = "Gregory Piatetsky-Shapiro and William J.
Frawley", booktitle = "Knowledge Discovery in Databases", publisher = "AAAI Press / The MIT Press", address = "Menlo Park, California", edition = "First", year = "1991", author = "Q. Wu and P. Suetens and A. Oosterlinck", title = "Integration of Heuristic and Bayesian Approaches in a Pattern-Classification System", pages = "249--260", } @TechReport{wuethrich:knowledge-discovery:, author = "B. Wuethrich", URL = "http://www.cs.ust.hk/faculty/beat/bio.html", title = "Knowledge Discovery in Databases", institution = "The Hong Kong University of Science and Technology, Computer Science Department", year = "1995", abstract = "[FROM DRAFT - anp] This is a draft of a manuscript of a postgraduate course taught at the Hong Kong University of Science and Technology in Spring 94. The course gives an introduction into the young and fascinating field of knowledge discovery in databases. The manuscript is suited for beginners who can leave out the more advanced sections, as well as people who would like to do research in this area. This manuscript is partly incomplete.

Table of Contents [edited - Andy]

1. Introduction 2. Rule Languages 3. Uncertainty 4. Time 5. Learning Propositional Rules and Decision Trees 6. Learning Datalog Rules 7. Learning Probabilistic Knowledge

Dr. Beat Wuethrich The Hong Kong University of Science and Technology CS Dept (room 3512) Clear Water Bay Kowloon, Hong Kong email: beat(at)cs.ust.hk", annote = "The link above has report divided into sections, the Full report in one file is also available at ftp://ftp.cs.ust.hk/pub/techreport/95/tr95-04.ps.gz", } @Article{wuthrich:probabilistic-bases:95, author = "B. Wuthrich", address = "Hong Kong Univ Sci \& Technol, Kowloon, Hong Kong", title = "Probabilistic knowledge bases", journal = "IEEE Transactions on Knowledge and Data Engineering", year = "1995", volume = "7", number = "5", pages = "691--698", abstract = "We define a new fixpoint semantics for rule-based reasoning in the presence of weighted information. The semantics is illustrated on a real-world application requiring such reasoning. Optimizations and approximations of the semantics are shown so as to make the semantics amenable to very large scale real-world applications. We finally prove that the semantics is probabilistic and reduces to the usual fixpoint semantics of stratified Datalog if all information is certain. We implemented various knowledge discovery systems which automatically generate such probabilistic decision rules. In collaboration with a bank in Hong Kong we use one such system to forecast currency exchange rates.", keywords = "LOGIC, AXIOMATIC PROBABILITY THEORY, DATA MINING, INCOMPLETE INFORMATION, KNOWLEDGE DISCOVERY IN DATABASES, QUERY OPTIMIZATION AND APPROXIMATION, STRATIFIED DATALOG", } @InProceedings{xia:integrated-call:97, title = "Knowledge Discovery in Integrated Call Centers: {A} Framework for Effective Customer-Driven Marketing", author = "Paul Xia", pages = "279", crossref = "heckerman.ea:proceedings-third:97", } @Article{yasdi:2nd-international:94, author = "R.
Yasdi", address = "Hsch Bremerhaven, Karlstadt 8, D-27568 Bremerhaven, Germany", title = "2nd international workshop on rough sets and knowledge discovery - {Banff}, {Canada}, 10--15 {October} 1993", journal = "AI Communications", year = "1994", volume = "7", number = "2", pages = "128--129", } @Article{yasdi:learning-classification:91, author = "Ramin Yasdi", title = "Learning Classification Rules from Database in the Context of Knowledge Acquisition and Representation", journal = "IEEE Transactions on Knowledge and Data Engineering", year = "1991", volume = "3", number = "3", pages = "293--306", month = sep, } @InProceedings{yoda.ea:computing-optimized:97, title = "Computing Optimized Rectilinear Regions for Association Rules", author = "Kunikazu Yoda and Takeshi Fukuda and Yasuhiko Morimoto and Shinichi Morishita and Takeshi Tokuyama", pages = "96", crossref = "heckerman.ea:proceedings-third:97", } @TechReport{yoon.ea:framework-evolution:, URL = "ftp://isse.gmu.edu/pub/techrep/by_index/ISSE-TR-93-109.ps.Z", title = "A Framework for Knowledge Discovery and Evolution in Databases (78{K})", author = "Jong P. Yoon and Larry Kerschberg", institution = "George Mason University, ISSE", number = "ISSE-TR-93-109", note = "George Mason U, ISSE. July 03, 1994.", } @Article{yoon.ea:framework-evolution:93, author = "J. P. Yoon and L. Kerschberg", address = "George Mason Univ, Sch Informat Technol \& Engn, Ctr Artificial Intelligence, Fairfax, Va, 22030", title = "A framework for knowledge discovery and evolution in databases", journal = "IEEE Transactions on Knowledge and Data Engineering", year = "1993", month = dec, volume = "5", number = "6", pages = "973--979", abstract = "Although knowledge discovery is increasingly important in databases, discovered knowledge is not always useful to users. It is mainly because the discovered knowledge does not fit the user's interests, or it may be redundant or inconsistent with a priori knowledge.
Knowledge discovery in databases depends critically on how well a database is characterized and how consistently the existing and discovered knowledge is evolved. This paper describes a novel concept for knowledge discovery and evolution in databases. The key issues of this work include: using a database query to discover new rules; using not only positive examples (answer to a query) but also negative examples to discover new rules; harmonizing existing rules with the new rules. The main contribution of this paper is the development of a new tool for 1) characterizing the exceptions in databases and 2) evolving knowledge as a database evolves.", keywords = "ACTIVE DATABASE EVOLUTION, DATABASE MINING, EXPERTISE TRANSFER, KNOWLEDGE DISCOVERY, KNOWLEDGE REFINEMENT", } @TechReport{zaki.ea:evaluation-sampling:96, author = "Mohammed Javeed Zaki and Srinivasan Parthasarathy and Wei Li and Mitsunori Ogihara", title = "Evaluation of Sampling for Data Mining of Association Rules", institution = "University of Rochester, Computer Science Department", number = "TR 617", month = may, year = "1996", keywords = "data mining; association rules; random sampling; Chernoff bounds", URL = "ftp://ftp.cs.rochester.edu/pub/papers/systems/96.tr617.Sampling_for_data_mining_of_association_rules.ps.gz", abstract = "Data mining is an emerging research area, whose goal is to extract significant patterns or interesting rules from large databases. High-level inference from large volumes of routine business data can provide valuable information to businesses, such as customer buying patterns, shelving criterion in supermarkets, and stock trends. However, many algorithms proposed for data mining of association rules make repeated passes over the database to determine the commonly occurring {\em itemsets} (or set of items). For large databases, the I/O overhead in scanning the database can be extremely high. 
In this paper we show that random sampling of transactions in the database is an effective method for finding association rules. Sampling can speed up the mining process by more than an order of magnitude by reducing I/O costs and drastically shrinking the number of transactions to be considered. We may also be able to make the sampled database resident in main-memory. Furthermore, we show that sampling can accurately represent the data patterns in the database with high confidence. We experimentally evaluate the effectiveness of sampling on three databases.", } @InProceedings{zaki.ea:new-algorithms:97, title = "New Algorithms for Fast Discovery of Association Rules", author = "M. J. Zaki and S. Parthasarathy and M. Ogihara and W. Li", pages = "283", crossref = "heckerman.ea:proceedings-third:97", } @InProceedings{zaki.ea:parallel-association:96a, key_modifier = "a", author = "M. J. Zaki and M. Ogihara and S. Parthasarathy and W. Li", title = "Parallel Data Mining for Association Rules on Shared-memory Multi-processors", booktitle = "CD-ROM Proceedings of Supercomputing'96", publisher = "IEEE", address = "Pittsburgh, PA", month = nov, year = "1996", } @TechReport{zaki.ea:parallel-association:96b, key_modifier = "b", author = "Mohammed Javeed Zaki and Mitsunori Ogihara and Srinivasan Parthasarathy and Wei Li", title = "Parallel Data Mining for Association Rules on Shared-Memory Multiprocessors", institution = "University of Rochester, Computer Science Department", number = "TR 618", month = may, pages = "22", year = "1996", keywords = "data mining; association rules; load balancing; hash tree balancing; hashing; shared-memory multiprocessor", URL = "ftp://ftp.cs.rochester.edu/pub/papers/systems/96.tr618.Parallel_data_mining_for_association_rules.ps.gz", abstract = "Data mining is an emerging research area, whose goal is to extract significant patterns or interesting rules from large databases.
High-level inference from large volumes of routine business data can provide valuable information to businesses, such as customer buying patterns, shelving criterion in supermarkets, and stock trends. Many algorithms have been proposed for data mining of association rules. However, research so far has mainly focused on sequential algorithms. In this paper we present parallel algorithms for data mining of association rules, and study the degree of parallelism, synchronization, and data locality issues on the SGI Power Challenge shared-memory multi-processor. We further present a set of optimizations for the sequential and parallel algorithms. Experiments show that a significant improvement of performance is achieved using our proposed optimizations. We also achieved good speed-up for the parallel algorithm, but we observe a need for parallel I/O techniques for further performance gains.", } @InProceedings{zamir.ea:fast-intuitive:97, title = "Fast and Intuitive Clustering of Web Documents", author = "Oren Zamir and Oren Etzioni and Omid Madani and Richard M. Karp", pages = "287", crossref = "heckerman.ea:proceedings-third:97", } @Article{zhong.ea:discovering-concept:94, author = "N. Zhong and S. Ohsuga", address = "Univ Tokyo, Adv Sci \& Technol Res Ctr, 4-6-1 Komaba, Meguro Ku, Tokyo 153, Japan", title = "Discovering concept clusters by decomposing databases", journal = "Data \& Knowledge Engineering", year = "1994", volume = "12", number = "2", pages = "223--244", abstract = "This paper introduces an approach of discovering concept clusters by decomposing databases. This approach is the fundamental one for developing DBI which is one of sub-systems of the GLS discovery system implemented by us. A key feature of this approach is the formation of concept clusters or sub-databases through analysis and deletion of noisy data in decomposing a database.
Its development is based on the concept of Simon and Ando's near-complete decomposability that has been most explicitly used in economic theory. In this approach, the process of discovering concept clusters from databases is a process based on incipient hypothesis generation and refinement, and many kinds of learning methods, in which the methods of data-driven and knowledge- driven are included, are cooperatively used in multiple learning phases, so that a more robust, general discovery system can be developed.", keywords = "KNOWLEDGE DISCOVERY, KNOWLEDGE DISCOVERY IN DATABASES, CONCEPTUAL CLUSTERING, NEAR- COMPLETE DECOMPOSABILITY, MULTIPLE LEARNING PHASES, INTEGRATION", } @Article{zhong.ea:hierarchical-model:96, author = "N. Zhong and S. Ohsuga", address = "Yamaguchi Univ, Fac Engn, Dept Comp Sci \& Syst Engn, 2557 Tokiwadai, Ube, Yamaguchi 755, Japan Waseda Univ, Sch Sci \& Engn, Dept Informat \& Comp Sci, Shinjuku Ku, Tokyo 169, Japan", title = "A hierarchical model learning approach for refining and managing concept clusters discovered from databases", journal = "Data \& Knowledge Engineering", year = "1996", volume = "20", issue = "2", pages = "227--252", abstract = "The contents of most databases are ever-changing, and erroneous data can be a significant problem in real-world databases. Therefore, the process of discovering knowledge from databases is a process based on incipient hypothesis generation/evaluation and refinement/management. Although many systems for knowledge discovery in databases have been proposed, most systems have not addressed the capabilities of refining/managing the discovered knowledge. This paper describes a hierarchical model learning approach for refining/managing concept clusters discovered from databases. 
This approach is the basic one for developing HML (Hierarchical Model Learning), which is one sub- system of our GLS (Global Learning Scheme) discovery system and can be cooperatively used with other sub-systems of GLS such as DBI (Decomposition Based Induction). By means of HML, concept clusters discovered from a database by DBI can be represented as the Multi- Layer Logic formulae with hierarchical models in a knowledge-base and can be easily refined/managed according to data change in a database and/or domain knowledge. HML is based on the model representation of Multi- Layer Logic (MLL). Its key feature is the quantitative evaluation for selecting the best representation of the MLL formulae by using cooperatively a criterion based on information theory and domain knowledge. Experience with a prototype of HML implemented by the knowledge-based system KAUS is discussed.", keywords = "KNOWLEDGE DISCOVERY, INDUCTION, SYSTEMS, RULES, KNOWLEDGE DISCOVERY IN DATABASES, MULTILAYER LOGIC, MACHINE LEARNING, INFORMATION THEORY, HIERARCHICAL MODELING, REFINEMENT, MANAGEMENT", } @InProceedings{zhong.ea:kdd-process:97, title = "{KDD} Process Planning", author = "Ning Zhong and Chunnian Liu and Yoshitsugu Kakemoto and Setsuo Ohsuga", pages = "291", crossref = "heckerman.ea:proceedings-third:97", } @Article{zhong.ea:system-managing:96, author = "N. Zhong and S. Ohsuga", address = "Univ Tokyo, Adv Sci \& Technol Res Ctr, Meguro Ku, 4-6-1 Komaba, Tokyo 153, Japan Waseda Univ, Dept Informat \& Comp Sci, Shinjuku Ku, Tokyo 169, Japan", title = "System for managing and refining structural characteristics discovered from databases", journal = "Knowledge-Based Systems", year = "1996", volume = "9", issue = "4", pages = "267--279", abstract = "Systems that allow automatic knowledge discovery from databases will play an increasingly important role in building/sharing large scale knowledge bases. 
Although many systems for knowledge discovery in databases have been proposed, few of them have addressed the capabilities of managing and refining the discovered knowledge. In particular, the contents of most databases are ever changing and erroneous data can be a significant problem in real-world databases. Hence, the process of discovering knowledge from databases is a process based on incipient hypothesis generation/evaluation and refinement/management. The paper describes a system named IIBR (Inheritance Inference Based Refinement) for managing and refining structural characteristics discovered from databases. Structural characteristics are a kind of important regularity hidden in databases, and are denoted by regression models for describing three kinds of functional relations: the exact, strong and weak ones. IIBR is one subsystem of the authors' GLS (Global Learning Scheme) discovery system, and can be cooperatively used with other subsystems of GLS such as KOSI (Knowledge Oriented Statistic Inference). By means of IIBR, the structural characteristics discovered by KOSI can be added to a knowledge base as the deductive rules and the sets of data for showing their errors, and can be easily managed and refined according to data change in a database. IIBR is based on inheritance inference and error analysis, as well as the model representation of knowledge, multiple worlds/levels, and metareasoning in the knowledge-based system KAUS. Experience with a prototype of IIBR implemented by KAUS is discussed.", keywords = "KNOWLEDGE DISCOVERY, KNOWLEDGE DISCOVERY IN DATABASES, INHERITANCE INFERENCE, ERROR ANALYSIS, DATA CHANGE, KNOWLEDGE REPRESENTATION", } @Article{ziarko.ea:method-computing:96, author = "W. Ziarko and N. 
Shan", address = "Univ Regina, Dept Comp Sci, Regina, Sk S4S 0A2, Canada", title = "A method for computing all maximally general rules in attribute-value systems", journal = "Computational Intelligence", year = "1996", volume = "12", issue = "2", pages = "223--234", abstract = "A method for finding all deterministic and maximally general rules for a target classification is explained in detail and illustrated with examples: Maximally general rules are rules with minimal numbers of conditions. The method has been developed within the context of the rough sets model and is based on the concepts of a decision matrix and a decision function. The problem of finding ail the rules is reduced to the problem of computing prime implicants of a group of associated Boolean expressions. The method is particularly applicable to identifying all potentially interesting deterministic rules in a knowledge discovery system but can also be used to produce possible rules or nondeterministic rules with decision probabilities, by adapting the method to the definitions of the variable precision rough sets model.", keywords = "KNOWLEDGE DISCOVERY, MACHINE LEARNING, ROUGH SETS, RULES", } @Article{ziarko:introduction-to:95, author = "W. Ziarko", address = "Univ Regina, Dept Comp Sci, Regina, Sk S4S 0A2, Canada", title = "Introduction to the special issue on rough sets and knowledge discovery", journal = "Computational Intelligence", year = "1995", volume = "11", issue = "2", pages = "223--226", } @Article{ziarko:some-privacy:95, author = "W. 
Ziarko", address = "Univ Regina, Dept Comp Sci, Regina, Sk S4S 0A2, Canada", title = "Some privacy issues in knowledge discovery - oecd personal privacy guidelines - response", journal = "Ieee Expert-Intelligent Systems \& Their Applications", year = "1995", volume = "10", issue = "2", pages = "59--59", keywords = "ethics, ethical", } @InProceedings{zighed.ea:optimal-multiple:97, title = "Optimal Multiple Intervals Discretization of Continuous Attributes for Supervised Learning", author = "D. A. Zighed and R. Rakotomalala and F. Feschet", pages = "295", crossref = "heckerman.ea:proceedings-third:97", } @InProceedings{zupan.ea:dataset-decomposition:97, title = "A Dataset Decomposition Approach to Data Mining and Machine Discovery", author = "Blaz Zupan and Marko Bohanec and Ivan Bratko and Bojan Cestnik", pages = "299", crossref = "heckerman.ea:proceedings-third:97", } @InProceedings{zytkow.ea:automated-pattern:96, title = "Automated Pattern Mining with a Scale Dimension", pages = "158", author = "Jan M. Zytkow and Robert Zembowicz", crossref = "simoudis.ea:proceedings-second:96", } @InCollection{zytkow.ea:interactive-regularities:91, author = "Jan M. Zytkow and John Baker", title = "Interactive mining for regularities in Databases", booktitle = "Knowledge Discovery in Databases", editor = "Gregory Piatetsky-Shapiro and William J. Frawley", publisher = "{AAAI Press}", year = "1991", address = "Menlo Park, California", pages = "31--53", } @Article{zytkow.ea:patterns-at:96, author = "J. Zytkow and R. Zembowicz", title = "Mining patterns at each scale in massive data", journal = "Lecture Notes in Computer Science", volume = "1079", pages = "139--??", year = "1996", ISSN = "0302-9743", } @InProceedings{zytkow:combining-many:87, author = "Jan M. 
Zytkow", title = "Combining many searches in the {FAHRENHEIT} discovery system", booktitle = "Proceedings of the fourth international workshop on machine learning", year = "1987", address = "San Mateo, California", publisher = "Morgan Kaufmann", pages = "281--287", } @InProceedings{zytkow:concepts-harmful:97, title = "Knowledge = Concepts: {A} Harmful Equation", author = "Jan M. Zytkow", pages = "104", crossref = "heckerman.ea:proceedings-third:97", }