% Bibliography database: information retrieval, text classification,
% clustering, index compression and XML retrieval references.
%
% Layout: one entry per block, one field per line. Remarks starting with "%"
% live outside entries, each on its own line, so that no tool can mistake an
% entry for part of a comment.
%
% NOTE(review): doi/ee/url values are stored as scheme-less resolver URLs
% (e.g. doi.acm.org/...). They are kept as-is; presumably the bibliography
% style prepends the scheme -- confirm before normalising to bare DOIs.

@inproceedings{domingos00unified,
  author = {Pedro Domingos},
  title = {A Unified Bias-Variance Decomposition for Zero-One and Squared Loss},
  booktitle = {Proc. National Conference on Artificial Intelligence and Proc. Conference Innovative Applications of Artificial Intelligence},
  year = {2000},
  isbn = {0-262-51112-6},
  pages = {564--569},
  publisher = {AAAI Press / The MIT Press},
}

@article{turtle95query,
  author = {Howard Turtle and James Flood},
  title = {Query evaluation: strategies and optimizations},
  journal = {IP\&M},
  volume = {31},
  number = {6},
  year = {1995},
  issn = {0306-4573},
  pages = {831--850},
  doi = {dx.doi.org/10.1016/0306-4573(95)00020-H},
  publisher = {Pergamon Press},
  address = {Tarrytown, NY, USA},
}

@inproceedings{aberer01pgrid,
  author = {Karl Aberer},
  title = {{P-Grid}: A Self-Organizing Access Structure for {P2P} Information Systems},
  booktitle = {Proc. International Conference on Cooperative Information Systems},
  year = {2001},
  isbn = {3-540-42524-1},
  pages = {179--194},
  publisher = {Springer},
  address = {London, UK},
}

@incollection{callan00distributed,
  author = {Jamie Callan},
  title = {Distributed information retrieval},
  editor = {W. Bruce Croft},
  booktitle = {Advances in information retrieval},
  pages = {127--150},
  publisher = {Kluwer},
  year = {2000},
}

@inproceedings{robertson04simple,
  author = {Stephen Robertson and Hugo Zaragoza and Michael Taylor},
  title = {Simple {BM25} extension to multiple weighted fields},
  booktitle = {Proc. CIKM},
  year = {2004},
  isbn = {1-58113-874-1},
  pages = {42--49},
  doi = {doi.acm.org/10.1145/1031171.1031181},
}
% location = {Washington, D.C., USA},
% publisher = {ACM Press},
% address = {New York, NY, USA},

@article{smeulders00contentbased,
  author = {Arnold W. M. Smeulders and Marcel Worring and Simone Santini and Amarnath Gupta and Ramesh Jain},
  title = {Content-Based Image Retrieval at the End of the Early Years},
  journal = {IEEE Trans. Pattern Anal. Mach. Intell.},
  volume = {22},
  number = {12},
  year = {2000},
  issn = {0162-8828},
  pages = {1349--1380},
  doi = {dx.doi.org/10.1109/34.895972},
  publisher = {IEEE Computer Society},
  address = {Washington, DC, USA},
}

@inproceedings{tombros98advantages,
  author = {Anastasios Tombros and Mark Sanderson},
  title = {Advantages of query biased summaries in information retrieval},
  booktitle = {Proc. SIGIR},
  year = {1998},
  isbn = {1-58113-015-5},
  pages = {2--10},
  location = {Melbourne, Australia},
  doi = {doi.acm.org/10.1145/290941.290947},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@techreport{oard96survey,
  author = {Douglas W. Oard and Bonnie J. Dorr},
  title = {A survey of multilingual text retrieval},
  year = {1996},
  institution = {Institute for Advanced Computer Studies, University of Maryland},
  number = {UMIACS-TR-96-19},
  address = {College Park, MD, USA},
}

@book{ingwersen05turn,
  author = {Peter Ingwersen and Kalervo J{\"a}rvelin},
  title = {The Turn: Integration of Information Seeking and Retrieval in Context},
  year = {2005},
  isbn = {140203850X},
  publisher = {Springer},
  address = {Secaucus, NJ, USA},
}

% "Sp{\"a}rck Jones" is a two-word surname; the comma form stops BibTeX from
% taking only "Jones" as the last name.
@article{lewis96natural,
  author = {David D. Lewis and Sp{\"a}rck Jones, Karen},
  title = {Natural language processing for information retrieval},
  journal = {CACM},
  volume = {39},
  number = {1},
  year = {1996},
  issn = {0001-0782},
  pages = {92--101},
  doi = {doi.acm.org/10.1145/234173.234210},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@book{meadow99text,
  author = {Charles T. Meadow and Donald H. Kraft and Bert R. Boyce},
  title = {Text Information Retrieval Systems},
  year = {1999},
  isbn = {0124874053},
  publisher = {Academic Press},
  address = {Orlando, FL, USA},
}

@article{creecy92trading,
  author = {Robert H. Creecy and Brij M. Masand and Stephen J. Smith and David L. Waltz},
  title = {Trading {MIPS} and memory for knowledge engineering},
  publisher = {ACM Press},
  journal = {CACM},
  volume = {35},
  number = {8},
  year = {1992},
  issn = {0001-0782},
  pages = {48--64},
  doi = {doi.acm.org/10.1145/135226.135228},
  address = {New York, NY},
}

@phdthesis{sornil01parallel,
  author = {Ohm Sornil},
  title = {Parallel Inverted Index for Large-Scale, Dynamic Digital Libraries},
  year = {2001},
  school = {Virginia Tech},
  url = {scholar.lib.vt.edu/theses/available/etd-02062001-114915/},
}

@incollection{harman92inverted,
  author = {Donna Harman and Ricardo Baeza-Yates and Edward Fox and W. Lee},
  title = {Inverted files},
  year = {1992},
  pages = {28--43},
  crossref = {frakes92information},
}

@book{garciamolina99database,
  author = {Hector Garcia-Molina and Jennifer Widom and Jeffrey D. Ullman},
  title = {Database System Implementation},
  year = {1999},
  isbn = {0130402648},
  publisher = {Prentice Hall},
  address = {Upper Saddle River, NJ, USA},
}

@inproceedings{altingovde07largescale,
  author = {Ismail Seng{\"o}r Alting{\"o}vde and Rifat Ozcan and Huseyin Cagdas Ocalan and Fazli Can and {\"O}zg{\"u}r Ulusoy},
  title = {Large-scale cluster-based retrieval experiments on {T}urkish texts},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2007},
  pages = {891--892},
  ee = {doi.acm.org/10.1145/1277741.1277961},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{eyheramendy03naive,
  author = {Susana Eyheramendy and David Lewis and David Madigan},
  title = {On the {Naive {Bayes}} Model for Text Categorization},
  booktitle = {International Workshop on Artificial Intelligence and Statistics},
  year = {2003},
  publisher = {Society for Artificial Intelligence and Statistics},
}

@inproceedings{cacheda03optimization,
  author = {Fidel Cacheda and Victor Carneiro and Carmen Guerrero and {\'A}ngel Vi{\~n}a},
  title = {Optimization of Restricted Searches in Web Directories Using Hybrid Data Structures},
  booktitle = {Proc. ECIR},
  year = {2003},
  pages = {436--451},
  ee = {link.springer.de/link/service/series/0558/bibs/2633/26330436.htm},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{pelleg99accelerating,
  author = {Dan Pelleg and Andrew Moore},
  title = {Accelerating exact k-means algorithms with geometric reasoning},
  booktitle = {Proc. KDD},
  publisher = {ACM Press},
  year = {1999},
  isbn = {1-58113-143-7},
  pages = {277--281},
  location = {San Diego, CA},
  doi = {doi.acm.org/10.1145/312129.312248},
  address = {New York, NY},
}

@inproceedings{bradley98refining,
  author = {Paul S. Bradley and Usama M. Fayyad},
  title = {Refining Initial Points for {K}-Means Clustering},
  booktitle = {Proc. ICML},
  year = {1998},
  pages = {91--99},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@article{can04efficiency,
  author = {Fazli Can and Ismail Seng{\"o}r Alting{\"o}vde and Engin Demir},
  title = {Efficiency and effectiveness of query processing in cluster-based retrieval},
  journal = {Information Systems},
  volume = {29},
  number = {8},
  year = {2004},
  issn = {0306-4379},
  pages = {697--717},
  doi = {dx.doi.org/10.1016/S0306-4379(03)00062-0},
  publisher = {Elsevier Science},
  address = {Oxford, UK},
}

@inproceedings{davidson03speeding,
  author = {Ian Davidson and Ashwin Satyanarayana},
  title = {Speeding up k-means Clustering by Bootstrap Averaging},
  booktitle = {ICDM 2003 Workshop on Clustering Large Data Sets},
  year = {2003},
}

@inproceedings{rosenzvi04authortopic,
  author = {Michal Rosen-Zvi and Thomas Griffiths and Mark Steyvers and Padhraic Smyth},
  title = {The author-topic model for authors and documents},
  booktitle = {Proc. UAI},
  year = {2004},
  isbn = {0-9749039-0-6},
  pages = {487--494},
}
% publisher = {AUAI Press},
% address = {Arlington, Virginia, United States},
% location = {Banff},

@inproceedings{trotman06xmlir,
  author = {Andrew Trotman and Nils Pharo and Miro Lehtonen},
  title = {{XML}-{IR} Users and Use Cases},
  booktitle = {Proc. INEX},
  year = {2006},
  pages = {400--412},
  ee = {dx.doi.org/10.1007/978-3-540-73888-6_38},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{betsi06user,
  author = {Stamatina Betsi and Mounia Lalmas and Anastasios Tombros and Theodora Tsikrika},
  title = {User expectations from {XML} element retrieval},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2006},
  pages = {611--612},
  ee = {doi.acm.org/10.1145/1148170.1148280},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{woodley06nlpx,
  author = {Alan Woodley and Shlomo Geva},
  title = {{NLPX} at {INEX} 2006},
  booktitle = {Proc. INEX},
  year = {2006},
  pages = {302--311},
  ee = {dx.doi.org/10.1007/978-3-540-73888-6_30},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{yang01thresholding,
  author = {Yiming Yang},
  title = {A study of thresholding strategies for text categorization},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2001},
  isbn = {1-58113-331-6},
  pages = {137--145},
  location = {New Orleans, LA},
  doi = {doi.acm.org/10.1145/383952.383975},
  address = {New York, NY},
}

@inproceedings{buckley95optimization,
  author = {Chris Buckley and Gerard Salton},
  title = {Optimization of relevance feedback weights},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {1995},
  isbn = {0-89791-714-6},
  pages = {351--357},
  location = {Seattle, Washington, United States},
  doi = {doi.acm.org/10.1145/215206.215383},
  address = {New York, NY},
}

@article{ault02information,
  author = {Thomas Galen Ault and Yiming Yang},
  title = {Information Filtering in {TREC-9} and {TDT-3}: {A} Comparative Analysis},
  journal = {IR},
  volume = {5},
  number = {2-3},
  year = {2002},
  issn = {1386-4564},
  pages = {159--187},
  publisher = {Kluwer},
  address = {Hingham, MA, USA},
}

@inproceedings{yang03marginbased,
  author = {Yiming Yang and Bryan Kisiel},
  title = {Margin-based local regression for adaptive filtering},
  booktitle = {Proc. CIKM},
  year = {2003},
  isbn = {1-58113-723-0},
  pages = {191--198},
  doi = {doi.acm.org/10.1145/956863.956902},
}
% location = {New Orleans, LA, USA},
% publisher = {ACM Press},
% address = {New York, NY},

@inproceedings{moschitti03optimal,
  author = {Alessandro Moschitti},
  title = {A Study on Optimal Parameter Tuning for {R}occhio Text Classifier},
  booktitle = {Proc. ECIR},
  year = {2003},
  pages = {420--435},
  ee = {link.springer.de/link/service/series/0558/bibs/2633/26330420.htm},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@techreport{bennett00assessing,
  author = {Paul N. Bennett},
  title = {Assessing the calibration of naive {Bayes}' posterior estimates},
  number = {CMU-CS-00-155},
  institution = {School of Computer Science, Carnegie Mellon University},
  year = {2000},
}

@inproceedings{turpin07fast,
  author = {Andrew Turpin and Yohannes Tsegay and David Hawking and Hugh E. Williams},
  title = {Fast generation of result snippets in web search},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2007},
  pages = {127--134},
  ee = {doi.acm.org/10.1145/1277741.1277766},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{ntoulas07pruning,
  author = {Alexandros Ntoulas and Junghoo Cho},
  title = {Pruning policies for two-tiered inverted index with correctness guarantee},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2007},
  pages = {191--198},
  ee = {doi.acm.org/10.1145/1277741.1277776},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{zhang07performance,
  author = {Jiangong Zhang and Xiaohui Long and Torsten Suel},
  title = {Performance of Compressed Inverted List Caching in Search Engines},
  booktitle = {Proc. CIKM},
  year = {2007},
}
% publisher = {ACM Press},

@inproceedings{silvestri04assigning,
  author = {Fabrizio Silvestri and Raffaele Perego and Salvatore Orlando},
  title = {Assigning document identifiers to enhance compressibility of Web Search Engines indexes},
  booktitle = {Proc. ACM Symposium on Applied Computing},
  year = {2004},
  pages = {600--605},
  ee = {doi.acm.org/10.1145/967900.968024},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{blandford02index,
  author = {Dan Blandford and Guy Blelloch},
  title = {Index Compression through Document Reordering},
  booktitle = {Proc. Data Compression Conference},
  year = {2002},
  pages = {342},
  publisher = {IEEE Computer Society},
  address = {Washington, DC, USA},
}

@article{blanco06tsp,
  author = {Roi Blanco and Alvaro Barreiro},
  title = {{TSP} and cluster-based solutions to the reassignment of document identifiers},
  journal = {IR},
  volume = {9},
  number = {4},
  year = {2006},
  pages = {499--517},
  ee = {dx.doi.org/10.1007/s10791-006-6614-y},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{silvestri07sorting,
  author = {Fabrizio Silvestri},
  title = {Sorting Out the Document Identifier Assignment Problem},
  booktitle = {Proc. ECIR},
  year = {2007},
  pages = {101--112},
  ee = {dx.doi.org/10.1007/978-3-540-71496-5_12},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{moffat96exploiting,
  author = {Alistair Moffat and Lang Stuiver},
  title = {Exploiting clustering in inverted file compression},
  booktitle = {Proc. Conference on Data Compression},
  year = {1996},
  isbn = {0-8186-7358-3},
  pages = {82--91},
  publisher = {IEEE Computer Society},
  address = {Washington, DC, USA},
}

@inproceedings{moschitti04complex,
  author = {Alessandro Moschitti and Roberto Basili},
  title = {Complex Linguistic Features for Text Classification: {A} Comprehensive Study},
  booktitle = {Proc. ECIR},
  year = {2004},
  pages = {181--196},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{rennie03tackling,
  author = {Jason D. Rennie and Lawrence Shih and Jaime Teevan and David R. Karger},
  title = {Tackling the Poor Assumptions of Naive {Bayes} Text Classifiers},
  booktitle = {Proc. ICML},
  year = {2003},
  pages = {616--623},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@article{can90concepts,
  author = {Fazli Can and Esen A. Ozkarahan},
  title = {Concepts and Effectiveness of the Cover-Coefficient-Based Clustering Methodology for Text Databases},
  journal = {ACM Trans. Database Syst.},
  volume = {15},
  number = {4},
  year = {1990},
  pages = {483--517},
  ee = {doi.acm.org/10.1145/99935.99938, db/journals/tods/CanO90.html},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@book{anderberg73cluster,
  author = {Michael R. Anderberg},
  title = {Cluster analysis for applications},
  address = {New York},
  publisher = {Academic Press},
  year = {1973},
}
% used to have "Probability and Mathematical Statistics, " in the publisher???

@techreport{fox91fastinv,
  author = {Edward A. Fox and Whay C. Lee},
  title = {{FAST-INV}: {A} Fast Algorithm for building large inverted files},
  year = {1991},
  source = {www.ncstrl.org:8900/ncstrl/servlet/search?formname=detail\&id=oai%3Ancstrlh%3Avatech_cs%3Ancstrl.vatech_cs%2F%2FTR-91-10},
  institution = {Virginia Polytechnic Institute \& State University},
  address = {Blacksburg, VA, USA},
}

@inproceedings{ng01spectral,
  author = {Andrew Y. Ng and Michael I. Jordan and Yair Weiss},
  title = {On Spectral Clustering: {A}nalysis and an algorithm},
  booktitle = {Proc. NIPS},
  year = {2001},
  pages = {849--856},
  ee = {www-2.cs.cmu.edu/Groups/NIPS/NIPS2001/papers/psgz/AA35.ps.gz},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{kannan00clusterings,
  author = {Ravi Kannan and Santosh Vempala and Adrian Vetta},
  title = {On clusterings -- {G}ood, bad and spectral},
  booktitle = {Proc. Symposium on Foundations of Computer Science},
  year = {2000},
  isbn = {0-7695-0850-2},
  pages = {367--377},
  publisher = {IEEE Computer Society},
  address = {Washington, DC, USA},
}

@article{boley98principal,
  author = {Daniel Boley},
  title = {Principal Direction Divisive Partitioning},
  journal = {Data Mining and Knowledge Discovery},
  volume = {2},
  number = {4},
  year = {1998},
  issn = {1384-5810},
  pages = {325--344},
  doi = {dx.doi.org/10.1023/A:1009740529316},
  publisher = {Kluwer},
  address = {Hingham, MA, USA},
}

@inproceedings{tishby00data,
  author = {Naftali Tishby and Noam Slonim},
  title = {Data Clustering by {M}arkovian Relaxation and the Information Bottleneck Method},
  booktitle = {Proc. NIPS},
  year = {2000},
  pages = {640--646},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{zha01bipartite,
  author = {Hongyuan Zha and Xiaofeng He and Chris H. Q. Ding and Ming Gu and Horst D. Simon},
  title = {Bipartite Graph Partitioning and Data Clustering},
  booktitle = {Proc. CIKM},
  year = {2001},
  pages = {25--32},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}
% publisher = {ACM Press},

@inproceedings{dhillon01coclustering,
  author = {Inderjit S. Dhillon},
  title = {Co-clustering documents and words using bipartite spectral graph partitioning},
  booktitle = {Proc. KDD},
  year = {2001},
  pages = {269--274},
  ee = {portal.acm.org/citation.cfm?id=502512.502550},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{hearst93subtopic,
  author = {Marti A. Hearst and Christian Plaunt},
  title = {Subtopic structuring for full-length document access},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {1993},
  isbn = {0-89791-605-0},
  pages = {59--68},
  location = {Pittsburgh, Pennsylvania, United States},
  doi = {doi.acm.org/10.1145/160688.160695},
  address = {New York, NY},
}

@incollection{berkhin06survey,
  author = {Pavel Berkhin},
  title = {A survey of clustering data mining techniques},
  booktitle = {Grouping Multidimensional Data: {R}ecent Advances in Clustering},
  editor = {Jacob Kogan and Charles Nicholas and Marc Teboulle},
  year = {2006},
  pages = {25--71},
  publisher = {Springer},
}

@inproceedings{mihajlovic05score,
  author = {Vojkan Mihajlovi{\'c} and Henk Ernst Blok and Djoerd Hiemstra and Peter M. G. Apers},
  title = {Score region algebra: {B}uilding a transparent {XML-R} database},
  booktitle = {Proc. CIKM},
  year = {2005},
  isbn = {1-59593-140-6},
  pages = {12--19},
  doi = {doi.acm.org/10.1145/1099554.1099560},
}
% publisher = {ACM Press},
% location = {Bremen},
% address = {New York, NY},

@techreport{chiaramella96model,
  author = {Yves Chiaramella and Philippe Mulhem and Franck Fourel},
  title = {A Model for Multimedia Information Retrieval},
  year = {1996},
  institution = {University of Glasgow},
  number = {4-96},
}

@inproceedings{forman04pitfall,
  author = {George Forman},
  title = {A pitfall and solution in multi-class feature selection for text classification},
  booktitle = {Proc. ICML},
  year = {2004},
  ee = {doi.acm.org/10.1145/1015330.1015356},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@article{tsochantaridis05large,
  author = {Ioannis Tsochantaridis and Thorsten Joachims and Thomas Hofmann and Yasemin Altun},
  title = {Large Margin Methods for Structured and Interdependent Output Variables},
  journal = {JMLR},
  volume = {6},
  year = {2005},
  pages = {1453--1484},
  ee = {www.jmlr.org/papers/v6/tsochantaridis05a.html},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{riezler07statistical,
  author = {Riezler, Stefan and Vasserman, Alexander and Tsochantaridis, Ioannis and Mittal, Vibhu and Liu, Yi},
  title = {Statistical Machine Translation for Query Expansion in Answer Retrieval},
  booktitle = {Proc. ACL},
  month = jun,
  year = {2007},
  address = {Prague, Czech Republic},
  publisher = {Association for Computational Linguistics},
  pages = {464--471},
  url = {www.aclweb.org/anthology/P/P07/P07-1059},
}

@book{cohen95empirical,
  title = {Empirical methods for artificial intelligence},
  address = {Cambridge, MA, USA},
  author = {Paul R. Cohen},
  publisher = {MIT Press},
  year = {1995},
}

@inproceedings{chucarroll06semantic,
  author = {Jennifer Chu-Carroll and John Prager and Krzysztof Czuba and David Ferrucci and Pablo Duboue},
  title = {Semantic search via {XML} fragments: {A} high-precision approach to {IR}},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2006},
  isbn = {1-59593-369-7},
  pages = {445--452},
  location = {Seattle, Washington, USA},
  doi = {doi.acm.org/10.1145/1148170.1148247},
  address = {New York, NY},
}

@inproceedings{arvola05generalized,
  author = {Paavo Arvola and Marko Junkkari and Jaana Kek{\"a}l{\"a}inen},
  title = {Generalized contextualization method for {XML} information retrieval},
  booktitle = {Proc. CIKM},
  year = {2005},
  pages = {20--27},
  ee = {doi.acm.org/10.1145/1099554.1099561},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}
% publisher = {ACM Press},

@inproceedings{sigurbjornsson04mixture,
  author = {B{\"o}rkur Sigurbj{\"o}rnsson and Jaap Kamps and Maarten de Rijke},
  title = {Mixture Models, Overlap, and Structural Hints in {XML} Element Retrieval},
  booktitle = {Proc. INEX},
  year = {2004},
  pages = {196--210},
  ee = {dx.doi.org/10.1007/11424550_16},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{vittaut06machine,
  author = {Jean-No{\"e}l Vittaut and Patrick Gallinari},
  title = {Machine Learning Ranking for Structured Information Retrieval},
  booktitle = {Proc. ECIR},
  year = {2006},
  pages = {338--349},
  ee = {dx.doi.org/10.1007/11735106_30},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@article{lalmas07evaluating,
  author = {Mounia Lalmas and Anastasios Tombros},
  title = {Evaluating {XML} retrieval effectiveness at {INEX}},
  publisher = {ACM Press},
  journal = {SIGIR Forum},
  volume = {41},
  number = {1},
  year = {2007},
  issn = {0163-5840},
  pages = {40--57},
  doi = {doi.acm.org/10.1145/1273221.1273225},
  address = {New York, NY},
}

@article{chaudhuri06probabilistic,
  author = {Surajit Chaudhuri and Gautam Das and Vagelis Hristidis and Gerhard Weikum},
  title = {Probabilistic information retrieval approach for ranking of database query results},
  publisher = {ACM Press},
  journal = {ACM Transactions on Database Systems},
  volume = {31},
  number = {3},
  year = {2006},
  issn = {0362-5915},
  pages = {1134--1168},
  doi = {doi.acm.org/10.1145/1166074.1166085},
  address = {New York, NY},
}

@inproceedings{cohen98integration,
  author = {William W. Cohen},
  title = {Integration of Heterogeneous Databases Without Common Domains Using Queries Based on Textual Similarity},
  booktitle = {Proc. SIGMOD},
  publisher = {ACM Press},
  year = {1998},
  isbn = {0-89791-955-5},
  pages = {201--212},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}
% editor = {Laura M.
Haas and Ashutosh Tiwary}, @Article{ navarro97proximal, author = {Gonzalo Navarro and Ricardo Baeza-Yates}, title = {Proximal nodes: {A} model to query document databases by content and structure}, publisher = {ACM Press}, journal = {TOIS}, volume = {15}, number = {4}, year = {1997}, issn = {1046-8188}, pages = {400--435}, doi = {doi.acm.org/10.1145/263479.263482}, address = {New York, NY}, } @Article{ fuhr97probabilistic, author = {Norbert Fuhr and Thomas R{\"o}lleke}, title = {A probabilistic relational algebra for the integration of information retrieval and database systems}, publisher = {ACM Press}, journal = {TOIS}, volume = {15}, number = {1}, year = {1997}, issn = {1046-8188}, pages = {32--66}, doi = {doi.acm.org/10.1145/239041.239045}, address = {New York, NY}, } @Article{ ameryahia05report, author = {Sihem Amer-Yahia and Pat Case and Thomas R{\"o}lleke and Jayavel Shanmugasundaram and Gerhard Weikum}, title = {Report on the {DB/IR} panel at {SIGMOD} 2005}, publisher = {ACM Press}, journal = {SIGMOD Record}, volume = {34}, number = {4}, year = {2005}, issn = {0163-5808}, pages = {71--74}, doi = {doi.acm.org/10.1145/1107499.1107514}, address = {New York, NY}, } @article{theobald08efficient, author = {Martin Theobald and Holger Bast and Debapriyo Majumdar and Ralf Schenkel and Gerhard Weikum}, title = {Top{X}: {E}fficient and versatile top-{\it k} query processing for semistructured data}, journal = {VLDB Journal}, volume = {17}, number = {1}, year = {2008}, pages = {81-115}, ee = {dx.doi.org/10.1007/s00778-007-0072-z}, } @Article{ ameryahia06xquery, author = {Sihem Amer-Yahia and Chavdar Botev and Jochen D{\"o}rre and Jayavel Shanmugasundaram}, title = {{XQuery} Full-Text extensions explained}, journal = {IBM Systems Journal}, volume = 45, number = 2, pages = {335--352}, year = 2006, } @InProceedings{ zavrel00information, author = "Jakub Zavrel and Peter Berck and Willem Lavrijssen", title = "Information Extraction by Text Classification: {C}orpus Mining 
for Features.", booktitle = {Workshop Information Extraction Meets Corpus Linguistics}, month = "May 30th", year = "2000", address = "Athens, Greece", note = "Held in conjunction with LREC-2000", url = "www.cnts.ua.ac.be/Publications/2000/ZBL00", } @Article{ zobel95efficient, author = {Justin Zobel and Alistair Moffat and Ross Wilkinson and Ron Sacks-Davis}, title = {Efficient retrieval of partial documents}, journal = {IP\&M}, volume = {31}, number = {3}, year = {1995}, issn = {0306-4573}, pages = {361--377}, doi = {dx.doi.org/10.1016/0306-4573(94)00052-5}, publisher = {Pergamon Press}, address = {Tarrytown, NY}, } @InProceedings{ salton93passage, author = {Gerard Salton and James Allan and Chris Buckley}, title = {Approaches to passage retrieval in full text information systems}, booktitle = {Proc. SIGIR}, publisher = {ACM Press}, year = {1993}, isbn = {0-89791-605-0}, pages = {49--58}, location = {Pittsburgh, Pennsylvania, United States}, doi = {doi.acm.org/10.1145/160688.160693}, address = {New York, NY}, } @InProceedings{ kaszkiel97passage, author = {Marcin Kaszkiel and Justin Zobel}, title = {Passage retrieval revisited}, booktitle = {Proc. SIGIR}, publisher = {ACM Press}, year = {1997}, isbn = {0-89791-836-3}, pages = {178--185}, location = {Philadelphia, Pennsylvania, United States}, doi = {doi.acm.org/10.1145/258525.258561}, address = {New York, NY}, } @Article{ hearst97texttiling, author = {Marti A. Hearst}, title = {{TextTiling}: {S}egmenting Text into Multi-paragraph Subtopic Passages.}, journal = {Computational Linguistics}, volume = {23}, number = {1}, year = {1997}, pages = {33-64}, bibsource = {DBLP, http://dblp.uni-trier.de}, } @InProceedings{ tan07using, author = {Songbo Tan and Xueqi Cheng}, title = {Using hypothesis margin to boost centroid text classifier}, booktitle = {Proc. 
ACM Symposium on Applied Computing}, publisher = {ACM Press}, year = {2007}, isbn = {1-59593-480-4}, pages = {398--403}, location = {Seoul, Korea}, doi = {doi.acm.org/10.1145/1244002.1244096}, address = {New York, NY}, } @InProceedings{ han00centroidbased, author = {Eui-Hong Han and George Karypis}, title = {Centroid-Based Document Classification: {A}nalysis and Experimental Results}, booktitle = {Proc. PKDD}, year = {2000}, pages = {424-431}, ee = {link.springer.de/link/service/series/0558/bibs/1910/19100424.htm} , bibsource = {DBLP, http://dblp.uni-trier.de}, } @InProceedings{ joachims97probabilistic, author = {Thorsten Joachims}, title = {A Probabilistic Analysis of the {R}occhio Algorithm with TFIDF for Text Categorization}, booktitle = {Proc. ICML}, year = {1997}, isbn = {1-55860-486-3}, pages = {143--151}, publisher = {Morgan Kaufmann}, address = {San Francisco, CA}, } @InProceedings{ allan98online, author = {James Allan and Ron Papka and Victor Lavrenko}, title = {On-line new event detection and tracking}, booktitle = {Proc. 
SIGIR}, publisher = {ACM Press}, year = {1998}, isbn = {1-58113-015-5}, pages = {37--45}, location = {Melbourne, Australia}, doi = {doi.acm.org/10.1145/290941.290954}, address = {New York, NY}, } @InProceedings{ trotman06passage, author = {Andrew Trotman and Shlomo Geva}, title = {Passage Retrieval and Other {XML}-Retrieval Tasks}, booktitle = {SIGIR 2006 Workshop on {XML} Element Retrieval Methodology}, pages = {43--50}, year = 2006, } @TechReport{ somogyi90melbourne, author = {Zoltan Somogyi}, title = {The {M}elbourne {U}niversity bibliography system}, year = {1990}, institution = {Melbourne University}, address = {Parkville, Victoria, Australia}, number = {90/3}, } @Article{ lesk88grab, author = {Michael Lesk}, title = {Grab -- {I}nverted indexes with low storage overhead}, year = 1988, journal = {Computing Systems}, volume = {1}, pages = {207--220}, } @InProceedings{ joachims06training, author = {Thorsten Joachims}, title = {Training linear {SVMs} in linear time}, booktitle = {Proc. KDD}, publisher = {ACM Press}, year = {2006}, isbn = {1-59593-339-5}, pages = {217--226}, location = {Philadelphia, PA, USA}, doi = {doi.acm.org/10.1145/1150402.1150429}, address = {New York, NY}, } @Article{ perkins03grafting, author = {Simon Perkins and Kevin Lacker and James Theiler}, title = {Grafting: {F}ast, incremental feature selection by gradient descent in function space}, journal = {JMLR}, volume = {3}, year = {2003}, issn = {1533-7928}, pages = {1333--1356}, publisher = {MIT Press}, address = {Cambridge, MA}, } @InProceedings{ fraenkel85novel, author = {Aviezri S. Fraenkel and Shmuel T. 
Klein}, title = {Novel Compression of sparse Bit-Strings -- Preliminary Report}, booktitle = {Combinatorial Algorithms on Words, NATO ASI Series Vol F12}, publisher = {Springer}, address = {Berlin}, year = 1985, pages = { 169--183}, } @InProceedings{ moffat92parameterised, author = {Alistair Moffat and Justin Zobel}, title = {Parameterised compression for sparse bitmaps}, booktitle = {Proc. SIGIR}, publisher = {ACM Press}, year = {1992}, isbn = {0-89791-523-2}, pages = {274--285}, location = {Copenhagen, Denmark}, doi = {doi.acm.org/10.1145/133160.133210}, address = {New York, NY}, } @Book{ pirolli07information, title = {Information Foraging Theory: {A}daptive Interaction With Information}, author = {Peter L. T. Pirolli}, publisher = {Oxford University Press}, year = 2007, } @Book{ langville, title = {Google's {PageRank} and Beyond: {T}he Science of Search Engine Rankings}, author = {Amy Langville and Carl Meyer}, publisher = {Princeton University Press}, year = 2006, } @Article{ fraley98how, author = {Chris Fraley and Adrian E. Raftery}, title = {How Many Clusters? {W}hich Clustering Method? {A}nswers Via Model-Based Cluster Analysis}, journal = {Computer Journal}, volume = {41}, number = {8}, year = {1998}, pages = {578-588}, bibsource = {DBLP, http://dblp.uni-trier.de}, } @InProceedings{ iwayama95clusterbased, author = {Makoto Iwayama and Takenobu Tokunaga}, title = {Cluster-Based Text Categorization: {A} Comparison of Category Search Strategies}, booktitle = {Proc. SIGIR}, publisher = {ACM Press}, year = {1995}, isbn = {0-89791-714-6}, pages = {273-280}, ee = {db/conf/sigir/IwayamaT95.html}, bibsource = {DBLP, http://dblp.uni-trier.de}, } @Article{ bartell98optimizing, author = {Brian T. Bartell and Garrison W. Cottrell and Richard K. 
Belew}, title = {Optimizing similarity using multi-query relevance feedback}, journal = {JASIS}, volume = {49}, number = {8}, year = {1998}, issn = {0002-8231}, pages = {742--761}, publisher = {John Wiley \& Sons}, address = {New York, NY}, } @Article{ dhillon01concept, author = {Inderjit S. Dhillon and Dharmendra S. Modha}, title = {Concept decompositions for large sparse text data using clustering}, journal = {Machine Learning}, volume = {42}, number = {1/2}, year = {2001}, issn = {0885-6125}, pages = {143--175}, doi = {dx.doi.org/10.1023/A:1007612920971}, publisher = {Kluwer}, address = {Hingham, MA}, } @InProceedings{ grabs02xml, title = {Generating Vector Spaces On-the-fly for Flexible {XML} Retrieval}, author = {Torsten Grabs and Hans-J{\"o}rg Schek}, booktitle = {{XML} and Information Retrieval Workshop at SIGIR 2002}, year = {2002}, } @Article{ schlieder02querying, author = {Torsten Schlieder and Holger Meuss}, title = {Querying and ranking {XML} documents}, journal = {JASIST}, volume = {53}, number = {6}, year = {2002}, issn = {1532-2882}, pages = {489--503}, doi = {dx.doi.org/10.1002/asi.10060}, publisher = {John Wiley \& Sons}, address = {New York, NY}, } @InProceedings{ tannier05xml, author = {Xavier Tannier and Shlomo Geva}, title = {{XML} Retrieval with a Natural Language Interface}, booktitle = {Proc. SPIRE}, year = {2005}, pages = {29-40}, ee = {dx.doi.org/10.1007/11575832_4}, bibsource = {DBLP, http://dblp.uni-trier.de}, } @InProceedings{ zwol06bricks, author = {Roelof {van~Zwol} and Jeroen Baas and Herre van Oostendorp and Frans Wiering}, title = {Bricks: {T}he Building Blocks to Tackle Query Formulation in Structured Document Retrieval}, booktitle = {Proc. 
ECIR}, year = {2006}, pages = {314-325}, ee = {dx.doi.org/10.1007/11735106_28}, bibsource = {DBLP, http://dblp.uni-trier.de}, } @Article{ fuhr04xirql, author = {Norbert Fuhr and Kai Gro{\ss}johann}, title = {{XIRQL}: {A}n {XML} query language based on information retrieval concepts}, journal = {TOIS}, volume = {22}, number = {2}, year = {2004}, pages = {313-356}, url = {doi.acm.org/10.1145/984321.984326}, bibsource = {DBLP, http://dblp.uni-trier.de}, } @InProceedings{ stein03cluster, author = {Benno Stein and Sven Meyer zu Eissen and Frank Wi{\ss}brock}, title = {On Cluster Validity and the Information Need of Users}, booktitle = {Proc. Artificial Intelligence and Applications}, year = 2003, } @InProceedings{ stein04topic, author = {Benno Stein and Sven Meyer zu Eissen}, title = {Topic Identification: {F}ramework and Application}, booktitle = {Proc. International Conference on Knowledge Management}, year = 2004, } @PhDThesis{ bartell94optimizing, author = {Brian Theodore Bartell}, title = {Optimizing ranking functions: {A} connectionist approach to adaptive information retrieval}, year = {1994}, order_no = {UMI Order No. GAX94-14751}, school = {University of California at San Diego}, address = {La Jolla, CA}, } @Book{ grossman04information, title = {Information Retrieval: {A}lgorithms and Heuristics}, author = {David A. Grossman and Ophir Frieder}, edition = {2nd}, publisher = {Springer}, year = 2004, } @InProceedings{ lu07cisr, author = {Wei Lu and Stephen E. Robertson and Andrew MacFarlane}, title = {{CISR} at {INEX} 2006}, booktitle = {Proc. INEX}, year = {2007}, pages = {57-63}, crossref = {fuhr07comparative}, } @InProceedings{ kamps04length, author = {Jaap Kamps and Maarten de Rijke and B{\"o}rkur Sigurbj{\"o}rnsson}, title = {Length normalization in {XML} retrieval}, booktitle = {Proc. 
SIGIR},
  publisher = {ACM Press},
  year = {2004},
  isbn = {1-58113-881-4},
  pages = {80--87},
  location = {Sheffield, United Kingdom},
  doi = {doi.acm.org/10.1145/1008992.1009009},
  address = {New York, NY},
}

@Article{list05tijah,
  author = {Johan List and Vojkan Mihajlovic and Georgina Ram{\'\i}rez and Arjen P. de Vries and Djoerd Hiemstra and Henk Ernst Blok},
  title = {{TIJAH}: {E}mbracing {IR} Methods in {XML} Databases},
  journal = {IR},
  volume = {8},
  number = {4},
  year = {2005},
  issn = {1386-4564},
  pages = {547--570},
  doi = {dx.doi.org/10.1007/s10791-005-0747-2},
  publisher = {Kluwer},
  address = {Hingham, MA, USA},
}

@Article{larson05fusion,
  author = {Ray R. Larson},
  title = {A Fusion Approach to {XML} Structured Document Retrieval},
  journal = {IR},
  volume = {8},
  number = {4},
  year = {2005},
  pages = {601--629},
  doi = {dx.doi.org/10.1007/s10791-005-0749-0},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

% NOTE(review): the original DOI was "10.1145/1185883", without the issue
% prefix used by ACM DOIs; restored as 1185877.1185883 -- please confirm
% against the publisher record.
@Article{kazai06extended,
  author = {Gabriella Kazai and Mounia Lalmas},
  title = {{eXtended} cumulated gain measures for the evaluation of content-oriented {XML} retrieval},
  journal = {TOIS},
  volume = {24},
  number = {4},
  year = {2006},
  pages = {503--542},
  doi = {doi.acm.org/10.1145/1185877.1185883},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@InProceedings{lalmas07inex,
  author = {Mounia Lalmas and Gabriella Kazai and Jaap Kamps and Jovan Pehcevski and Benjamin Piwowarski and Stephen E.
Robertson},
  title = {{INEX} 2006 evaluation measures},
  booktitle = {Proc. INEX},
  crossref = {fuhr07comparative},
  year = {2007},
  pages = {20--34},
}

@Proceedings{trotman07focused,
  editor = {Andrew Trotman and Shlomo Geva and Jaap Kamps},
  title = {SIGIR Workshop on Focused Retrieval},
  year = {2007},
  publisher = {University of Otago},
  location = {Dunedin, New Zealand},
}

% Crossref parent: classic BibTeX needs this entry to come after the entries
% that reference it through their crossref field.
@Book{fuhr07comparative,
  editor = {Norbert Fuhr and Mounia Lalmas and Andrew Trotman},
  title = {Comparative Evaluation of {XML} Information Retrieval Systems, 5th International Workshop of the Initiative for the Evaluation of {XML} Retrieval, {INEX} 2006},
  year = 2007,
  address = {Heidelberg},
  publisher = {Springer},
}
% number = 4518,
% series = {Lecture Notes in Computer Science/Lecture Notes in
% Artificial Intelligence (LNCS/LNAI)},

@InProceedings{okeefe04simplest,
  author = {Richard A. O'Keefe and Andrew Trotman},
  title = {The simplest query language that could possibly work},
  booktitle = {Proc. INEX},
  year = {2004},
  pages = {167--174},
  crossref = {fuhr04advances},
}

@Article{ameryahia06xml,
  author = {Sihem Amer-Yahia and Mounia Lalmas},
  title = {{XML} search: {L}anguages, {INEX} and scoring},
  publisher = {ACM Press},
  journal = {SIGMOD Record},
  volume = {35},
  number = {4},
  year = {2006},
  issn = {0163-5808},
  pages = {16--23},
  doi = {doi.acm.org/10.1145/1228268.1228271},
  address = {New York, NY},
}

@InProceedings{theobald05topx,
  author = {Martin Theobald and Ralf Schenkel and Gerhard Weikum},
  title = {An efficient and versatile query engine for {TopX} search},
  booktitle = {Proc. VLDB},
  year = {2005},
  isbn = {1-59593-154-6},
  pages = {625--636},
  location = {Trondheim},
  publisher = {VLDB Endowment},
}

@Proceedings{fuhr03inex2002,
  editor = {Norbert Fuhr and Norbert G{\"o}vert and Gabriella Kazai and Mounia Lalmas},
  title = {{INitiative for the Evaluation of {XML}} Retrieval ({INEX}). Proc. First {INEX} Workshop},
  address = {Sophia Antipolis, France},
  publisher = {ERCIM},
  booktitle = {Proc.
INEX 2002},
  entrydate = 20030226,
  month = mar,
  year = 2003,
}
% series = {ERCIM Workshop Proceedings},

@Proceedings{fuhr03inex,
  editor = {Norbert Fuhr and Saadia Malik and Mounia Lalmas},
  title = {{INEX} 2003 Workshop},
  year = {2003},
  url = {inex.is.informatik.uni-duisburg.de:2003/proceedings.pdf},
}

@Proceedings{fuhr05advances,
  editor = {Norbert Fuhr and Mounia Lalmas and Saadia Malik and Gabriella Kazai},
  title = {Advances in {XML} Information Retrieval and Evaluation, 4th International Workshop of the Initiative for the Evaluation of {XML} Retrieval, {INEX} 2005},
  booktitle = {Proc. INEX},
  publisher = {Springer},
  year = {2006},
  isbn = {3-540-34962-6},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}
% series = {Lecture Notes in Computer Science},
% volume = {3977},

@Article{kamps06articulating,
  author = {Jaap Kamps and Maarten Marx and Maarten de Rijke and B{\"o}rkur Sigurbj{\"o}rnsson},
  title = {Articulating information needs in {XML} query languages},
  publisher = {ACM Press},
  journal = {TOIS},
  volume = {24},
  number = {4},
  year = {2006},
  issn = {1046-8188},
  pages = {407--436},
  doi = {doi.acm.org/10.1145/1185877.1185879},
  address = {New York, NY},
}

@InProceedings{mccallum98improving,
  author = {Andrew McCallum and Ronald Rosenfeld and Tom M. Mitchell and Andrew Y. Ng},
  title = {Improving Text Classification by Shrinkage in a Hierarchy of Classes},
  booktitle = {Proc. ICML},
  year = {1998},
  isbn = {1-55860-556-8},
  pages = {359--367},
  publisher = {Morgan Kaufmann},
  address = {San Francisco, CA},
}

@InProceedings{trotman04narrowed,
  author = {Andrew Trotman and B{\"o}rkur Sigurbj{\"o}rnsson},
  title = {Narrowed {E}xtended {XP}ath {I} ({NEXI})},
  booktitle = {Proc. INEX},
  year = {2004},
  pages = {16--40},
  doi = {dx.doi.org/10.1007/11424550\_2},
  crossref = {fuhr04advances},
}

% Crossref parent of trotman04narrowed (above). Classic BibTeX only resolves a
% crossref when the parent appears later in the file than the entries that
% reference it, so this entry was moved below its child.
@Book{fuhr04advances,
  editor = {Norbert Fuhr and Mounia Lalmas and Saadia Malik and Zolt{\'a}n Szl{\'a}vik},
  title = {Advances in {XML} Information Retrieval, Third International Workshop of the Initiative for the Evaluation of {XML} Retrieval, {INEX} 2004},
  booktitle = {Proc. INEX},
  publisher = {Springer},
  year = {2005},
  isbn = {3-540-26166-4},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}
% series = {Lecture Notes in Computer Science},
% volume = {3493},

@InProceedings{fuhr07advances,
  author = {Norbert Fuhr and Mounia Lalmas},
  title = {Advances in {XML} Retrieval: {T}he {INEX} Initiative},
  booktitle = {International Workshop on Research Issues in Digital Libraries},
  year = 2007,
}

@TechReport{singhal95length,
  author = {Amit Singhal and Gerard Salton and Chris Buckley},
  title = {Length Normalization in Degraded Text Collections},
  year = {1995},
  institution = {Cornell University},
  address = {Ithaca, NY},
}

@InProceedings{singhal96length,
  author = {Amit Singhal and Gerard Salton and Chris Buckley},
  title = {Length Normalization in Degraded Text Collections},
  booktitle = {Proc. SDAIR},
  pages = {149--162},
  year = 1996,
}

@InProceedings{bast05spectral,
  author = {Holger Bast and Debapriyo Majumdar},
  title = {Why spectral retrieval works},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2005},
  isbn = {1-59593-034-5},
  pages = {11--18},
  location = {Salvador, Brazil},
  doi = {doi.acm.org/10.1145/1076034.1076040},
  address = {New York, NY},
}

@Article{moffat95insitu,
  author = {Alistair Moffat and Timothy A. H. Bell},
  title = {In situ generation of compressed inverted files},
  journal = {JASIS},
  volume = {46},
  number = {7},
  year = {1995},
  issn = {0002-8231},
  pages = {537--550},
  publisher = {John Wiley \& Sons},
  address = {New York, NY},
}

@InProceedings{zukowski06superscalar,
  author = {Marcin Zukowski and Sandor Heman and Niels Nes and Peter Boncz},
  title = {Super-Scalar {RAM-CPU} Cache Compression},
  booktitle = {Proc.
International Conference on Data Engineering},
  year = {2006},
  isbn = {0-7695-2570-9},
  pages = {59},
  doi = {dx.doi.org/10.1109/ICDE.2006.150},
  publisher = {IEEE Computer Society},
  address = {Washington, DC, USA},
}

@InProceedings{dom02information,
  author = {Byron E. Dom},
  title = {An Information-Theoretic External Cluster-Validity Measure},
  booktitle = {Proc. UAI},
  month = aug,
  year = {2002},
}

@InProceedings{blanco07boosting,
  author = {Roi Blanco and Alvaro Barreiro},
  title = {Boosting Static Pruning of Inverted Files},
  year = 2007,
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
}

@Article{weiss05concept,
  author = {Stanis{\l}aw Osi{\'n}ski and Dawid Weiss},
  title = {A Concept-Driven Algorithm for Clustering Search Results},
  journal = {IEEE Intelligent Systems},
  number = {3},
  volume = {20},
  pages = {48--54},
  year = {2005},
}

% The DBLP export stored the DOI in the non-standard field "ee"; it now uses
% "doi" like the other ACM entries.
@inproceedings{arthur06worstcase,
  author = {David Arthur and Sergei Vassilvitskii},
  title = {How slow is the {\it k}-means method?},
  booktitle = {Proc. ACM Symposium on Computational Geometry},
  year = {2006},
  pages = {144--153},
  doi = {doi.acm.org/10.1145/1137856.1137880},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@Book{fellbaum98wordnet,
  author = {Christiane D. Fellbaum},
  title = {{WordNet} -- An Electronic Lexical Database},
  publisher = {MIT Press},
  year = {1998},
}

@Article{fowlkes83clusterings,
  author = {Edward B. Fowlkes and Colin L. Mallows},
  title = {A Method for Comparing Two Hierarchical Clusterings},
  journal = {Journal of the American Statistical Association},
  volume = {78},
  year = {1983},
  number = {383},
  pages = {553--569},
  url = {www.jstor.org/view/01621459/di985957/98p0926l/0},
}

@InProceedings{kleinberg02impossibility,
  author = {Jon M. Kleinberg},
  title = {An Impossibility Theorem for Clustering},
  year = {2002},
  booktitle = {Proc. NIPS},
}

@InProceedings{meila05clusterings,
  author = {Marina Meil{\u{a}}},
  title = {Comparing clusterings -- {A}n axiomatic view},
  booktitle = {Proc.
ICML},
  year = {2005},
  address = {Bonn},
}

@Article{savaresi04pddp,
  author = {Sergio M. Savaresi and Daniel Boley},
  title = {A comparative analysis on the bisecting {K}-means and the {PDDP} clustering algorithms},
  journal = {Intelligent Data Analysis},
  volume = {8},
  number = {4},
  year = {2004},
  pages = {345--362},
}

@Article{castro04likelihood,
  author = {R. M. Castro and M. J. Coates and R. D. Nowak},
  title = {Likelihood Based Hierarchical Clustering},
  journal = {IEEE Transactions on Signal Processing},
  volume = 52,
  number = 8,
  year = {2004},
  pages = {2308--2321},
}

@InProceedings{kamvar02interpreting,
  author = {Sepandar D. Kamvar and Dan Klein and Christopher D. Manning},
  title = {Interpreting and Extending Classical Agglomerative Clustering Algorithms using a Model-Based approach},
  booktitle = {Proc. ICML},
  year = {2002},
  isbn = {1-55860-873-7},
  pages = {283--290},
  publisher = {Morgan Kaufmann},
  address = {San Francisco, CA},
}

@Book{mclachlan96em,
  author = {Geoffrey J. McLachlan and Thiriyambakam Krishnan},
  title = {The {EM} Algorithm and Extensions},
  year = 1996,
  publisher = {John Wiley \& Sons},
}

@Article{blei03latent,
  author = {David M. Blei and Andrew Y. Ng and Michael I. Jordan},
  title = {Latent {D}irichlet allocation},
  journal = {JMLR},
  volume = {3},
  year = {2003},
  issn = {1533-7928},
  pages = {993--1022},
  publisher = {MIT Press},
  address = {Cambridge, MA, USA},
}

@Book{rice06statistics,
  author = {John A. Rice},
  title = {Mathematical Statistics and Data Analysis},
  publisher = {Duxbury Press},
  year = 2006,
}

@Book{sheldon06probability,
  author = {Sheldon Ross},
  title = {A First Course in Probability},
  publisher = {Pearson Prentice Hall},
  year = 2006,
}

@InProceedings{buttcher06document,
  author = {Stefan B{\"u}ttcher and Charles L. A. Clarke},
  title = {A document-centric approach to static index pruning in text retrieval systems},
  booktitle = {Proc.
CIKM},
  year = {2006},
  isbn = {1-59593-433-2},
  pages = {182--189},
  doi = {doi.acm.org/10.1145/1183614.1183644},
}
% publisher = {ACM Press},
% location = {Arlington, VA},
% address = {New York, NY},

@Article{trotman03compressing,
  author = {Andrew Trotman},
  title = {Compressing Inverted Files},
  journal = {IR},
  volume = {6},
  number = {1},
  pages = {5--19},
  year = {2003},
  issn = {1386-4564},
  doi = {dx.doi.org/10.1023/A:1022949613039},
  publisher = {Kluwer},
  address = {Hingham, MA},
}

@Book{cover91elements,
  author = {Thomas M. Cover and Joy A. Thomas},
  title = {Elements of Information Theory},
  publisher = {Wiley},
  address = {New York},
  year = {1991},
}

@Article{barroso03web,
  author = {Luiz Andr{\'e} Barroso and Jeffrey Dean and Urs H{\"o}lzle},
  title = {Web Search for a Planet: {T}he {G}oogle Cluster Architecture},
  journal = {IEEE Micro},
  volume = {23},
  number = {2},
  pages = {22--28},
  year = {2003},
  issn = {0272-1732},
  doi = {dx.doi.org/10.1109/MM.2003.1196112},
  publisher = {IEEE Computer Society Press},
  address = {Los Alamitos, CA},
}

@Book{comtet74advanced,
  author = {Louis Comtet},
  title = {Advanced Combinatorics},
  publisher = {Reidel},
  year = {1974},
}

@InProceedings{ball65data,
  author = {G. H. Ball},
  title = {Data analysis in the social sciences: {W}hat about the details?},
  booktitle = {Proc. Fall Joint Computer Conference},
  pages = {533--560},
  publisher = {Spartan Books},
  year = 1965,
}

@Book{burnham02model,
  author = {Kenneth P. Burnham and David Anderson },
  title = {Model Selection and Multi-Model Inference},
  publisher = {Springer},
  year = {2002},
  isbn = {0387953647},
  citeulike-article-id = {157697},
}

@Article{hartigan79kmeans,
  author = {J. A. Hartigan and M. A.
Wong},
  title = {A {K}-Means Clustering Algorithm},
  journal = {Applied Statistics},
  volume = 28,
  pages = {100--108},
  year = 1979,
  key = {Hartigan/Wong:79},
  entrydate = 20030618,
}

@InProceedings{basu04active,
  author = {Sugato Basu and Arindam Banerjee and Raymond J. Mooney},
  title = {Active Semi-Supervision for Pairwise Constrained Clustering},
  booktitle = {Proc. SIAM International Conference on Data Mining},
  address = {Lake Buena Vista, FL},
  pages = {333--344},
  year = {2004},
}

@InProceedings{huang06text,
  author = {Yifen Huang and Tom M. Mitchell},
  title = {Text clustering with extended user feedback},
  booktitle = {Proc. SIGIR},
  year = {2006},
  pages = {413--420},
  isbn = {1-59593-369-7},
  location = {Seattle, WA},
  doi = {doi.acm.org/10.1145/1148170.1148242},
  publisher = {ACM Press},
  address = {New York, NY},
}

@InProceedings{crouch88cluster,
  author = {Carolyn J. Crouch},
  title = {A cluster-based approach to thesaurus construction},
  booktitle = {Proc. SIGIR},
  year = {1988},
  pages = {309--320},
  isbn = {2-7061-0309-4},
  location = {Grenoble},
  doi = {doi.acm.org/10.1145/62437.62467},
  publisher = {ACM Press},
  address = {New York, NY},
}

@InProceedings{schuetze95information,
  author = {Hinrich Sch{\"u}tze and Jan O. Pedersen},
  title = {Information Retrieval Based on Word Senses},
  booktitle = {Proc. SDAIR},
  address = {Las Vegas, NV},
  pages = {161--175},
  year = 1995,
}

@Book{witten05data,
  title = {Data Mining: {P}ractical Machine Learning Tools and Techniques},
  author = {Ian H.
Witten and Eibe Frank},
  edition = {Second},
  howpublished = {Paperback},
  month = jun,
  publisher = {Morgan Kaufmann},
  year = {2005},
  isbn = {0120884070},
  citeulike-article-id = {340715},
  priority = {0},
  keywords = {weka data mining da },
}
% series = {Morgan Kaufmann Series in Data Management Sys},

@InCollection{cheeseman96bayesian,
  author = {Peter Cheeseman and John Stutz},
  title = {Bayesian Classification ({AutoClass}): {T}heory and Results},
  booktitle = {Advances in Knowledge Discovery and Data Mining},
  year = {1996},
  pages = {153--180},
  publisher = {MIT Press},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

% NOTE(review): the note relies on a \urltilde macro that is not defined in
% this part of the file; presumably the document or a preamble provides it.
@Unpublished{mccallum96bow,
  author = {Andrew Kachites McCallum},
  title = {Bow: {A} toolkit for statistical language modeling, text retrieval, classification and clustering},
  note = {\url{www.cs.cmu.edu/{\urltilde}mccallum/bow}},
  year = 1996,
}

@InProceedings{picca06nonlinear,
  author = {Davide Picca and Beno{\^\i}t Curdy and Fran{\c{c}}ois Bavaud},
  title = {Non-linear correspondence analysis in text retrieval: {A} kernel view},
  booktitle = {Proc. JADT},
  year = {2006},
}

@Book{bishop06pattern,
  author = {Christopher M. Bishop},
  title = {Pattern Recognition and Machine Learning},
  publisher = {Springer},
  year = 2006,
}

@InProceedings{ghamrawi05collective,
  author = {Nadia Ghamrawi and Andrew McCallum},
  title = {Collective multi-label classification},
  booktitle = {Proc. CIKM},
  publisher = {ACM Press},
  year = {2005},
  isbn = {1-59593-140-6},
  pages = {195--200},
  location = {Bremen},
  doi = {doi.acm.org/10.1145/1099554.1099591},
  address = {New York, NY},
}

@Article{geman92neural,
  author = {Stuart Geman and Elie Bienenstock and Ren{\'e} Doursat},
  title = {Neural networks and the bias/variance dilemma},
  journal = {Neural Computation},
  volume = {4},
  number = {1},
  year = {1992},
  issn = {0899-7667},
  pages = {1--58},
  publisher = {MIT Press},
  address = {Cambridge, MA},
}

@InProceedings{anagnostopoulos06effective,
  author = {Aris Anagnostopoulos and Andrei Z.
Broder and Kunal Punera},
  title = {Effective and efficient classification on a search-engine model},
  booktitle = {Proc. CIKM},
  publisher = {ACM Press},
  year = {2006},
  isbn = {1-59593-433-2},
  pages = {208--217},
  location = {Arlington, VA},
  doi = {doi.acm.org/10.1145/1183614.1183648},
  address = {New York, NY},
}

% The placeholder month "????" from the original export was dropped; styles
% that print the month would otherwise emit it literally.
@Article{rahm01survey,
  author = {Erhard Rahm and Philip A. Bernstein},
  title = {A survey of approaches to automatic schema matching},
  journal = {VLDB Journal},
  volume = {10},
  number = {4},
  pages = {334--350},
  year = {2001},
  url = {citeseer.ist.psu.edu/rahm01survey.html},
}

@InProceedings{hatzivassiloglou00linguistic,
  author = {Vasileios Hatzivassiloglou and Luis Gravano and Ankineedu Maganti},
  title = {An investigation of linguistic features and clustering algorithms for topical document clustering},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2000},
  isbn = {1-58113-226-3},
  pages = {224--231},
  location = {Athens},
  doi = {doi.acm.org/10.1145/345508.345582},
  address = {New York, NY},
}

@Article{lance67general,
  author = {G. N. Lance and W. T. Williams},
  title = {A general theory of classificatory sorting strategies 1. {Hierarchical} systems},
  journal = {Computer Journal},
  volume = {9},
  number = {4},
  pages = {373--380},
  month = feb,
  year = {1967},
  coden = {CMPJA6},
  issn = {0010-4620},
}

@InProceedings{sahoo06incremental,
  author = {Nachiketa Sahoo and Jamie Callan and Ramayya Krishnan and George Duncan and Rema Padman},
  title = {Incremental hierarchical clustering of text documents},
  booktitle = {Proc. CIKM},
  year = {2006},
  isbn = {1-59593-433-2},
  pages = {357--366},
  doi = {doi.acm.org/10.1145/1183614.1183667},
}
% publisher = {ACM Press},
% location = {Arlington, VA},
% address = {New York, NY},

@InProceedings{larsen99fast,
  author = {Bjornar Larsen and Chinatsu Aone},
  title = {Fast and effective text mining using linear-time document clustering},
  booktitle = {Proc.
KDD},
  publisher = {ACM Press},
  year = {1999},
  isbn = {1-58113-143-7},
  pages = {16--22},
  location = {San Diego, CA},
  doi = {doi.acm.org/10.1145/312129.312186},
  address = {New York, NY},
}

@InProceedings{zhao02evaluation,
  author = {Ying Zhao and George Karypis},
  title = {Evaluation of hierarchical clustering algorithms for document datasets},
  booktitle = {Proc. CIKM},
  publisher = {ACM Press},
  year = {2002},
  isbn = {1-58113-492-4},
  pages = {515--524},
  location = {McLean, VA},
  doi = {doi.acm.org/10.1145/584792.584877},
  address = {New York, NY},
}

@InProceedings{buttcher05indexing,
  author = {Stefan B{\"u}ttcher and Charles L. A. Clarke},
  title = {Indexing time vs. query time: {T}rade-offs in dynamic information retrieval systems},
  booktitle = {Proc. CIKM},
  publisher = {ACM Press},
  year = {2005},
  isbn = {1-59593-140-6},
  pages = {317--318},
  location = {Bremen},
  doi = {doi.acm.org/10.1145/1099554.1099645},
  address = {New York, NY},
}

@InProceedings{forman06tackling,
  author = {George Forman},
  title = {Tackling concept drift by temporal inductive transfer},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2006},
  isbn = {1-59593-369-7},
  pages = {252--259},
  location = {Seattle, WA},
  doi = {doi.acm.org/10.1145/1148170.1148216},
  address = {New York, NY},
}

@Article{brisaboa06lightweight,
  author = {Nieves R. Brisaboa and Antonio Fari{\~n}a and Gonzalo Navarro and Jos{\'e} R. Param{\'a}},
  title = {Lightweight Natural Language Text Compression},
  journal = {IR},
  year = 2007,
  volume = 10,
  number = 1,
  pages = {1--33},
}

@InProceedings{buttcher05security,
  author = {Stefan B{\"u}ttcher and Charles L. A. Clarke},
  title = {A Security Model for Full-Text File System Search in Multi-User Environments},
  booktitle = {Proc.
FAST},
  year = {2005},
  url = {www.usenix.org/events/fast05/tech/buettcher.html},
}

@Book{heaps78information,
  author = {Heaps, Harold S.},
  title = {Information Retrieval: {C}omputational and Theoretical Aspects},
  year = {1978},
  publisher = {Academic Press},
  address = {New York},
}

@Article{anh06improved,
  author = {Vo Ngoc Anh and Alistair Moffat},
  title = {Improved Word-Aligned Binary Compression for Text Indexing},
  journal = {IEEE Transactions on Knowledge and Data Engineering},
  year = 2006,
  volume = 18,
  number = 6,
  pages = {857--861},
}

@InProceedings{buckley94automatic,
  author = {Chris Buckley and James Allan and Gerard Salton},
  title = {Automatic Routing and Ad-hoc Retrieval using {SMART}: {TREC} 2},
  booktitle = {Proc. TREC},
  pages = {45--55},
  year = 1994,
}

@InProceedings{schapire98boosting,
  author = {Robert E. Schapire and Yoram Singer and Amit Singhal},
  title = {Boosting and {R}occhio Applied to Text Filtering},
  year = 1998,
  pages = {215--223},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
}

% The empty publisher and editor fields were removed, and the address is
% written as in schuetze95information.
@InProceedings{ittner95text,
  author = {David J. Ittner and David D. Lewis and David D. Ahn},
  title = {Text categorization of low quality images},
  booktitle = {Proc. SDAIR},
  year = {1995},
  address = {Las Vegas, NV},
  pages = {301--315},
}

@InProceedings{alonso06gio,
  author = {Omar Alonso and Sandeepan Banerjee and Mark Drake},
  title = {{GIO}: {A} semantic web application using the information grid framework},
  booktitle = {Proc.
WWW},
  publisher = {ACM Press},
  year = {2006},
  isbn = {1-59593-323-9},
  pages = {857--858},
  location = {Edinburgh},
  doi = {doi.acm.org/10.1145/1135777.1135913},
  address = {New York, NY},
}

@InProceedings{toda05search,
  author = {Hiroyuki Toda and Ryoji Kataoka},
  title = {A search result clustering method using informatively named entities},
  booktitle = {International Workshop on Web Information and Data Management},
  publisher = {ACM Press},
  year = {2005},
  isbn = {1-59593-194-5},
  pages = {81--86},
  location = {Bremen},
  doi = {doi.acm.org/10.1145/1097047.1097063},
  address = {New York, NY},
}

@InProceedings{ogilvie05parameter,
  author = {Paul Ogilvie and Jamie Callan},
  title = {Parameter Estimation for a Simple Hierarchical Generative Model for {XML} Retrieval},
  booktitle = {Proc. INEX},
  year = {2005},
  pages = {211--224},
  doi = {dx.doi.org/10.1007/11766278\_16},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@Article{witten90source,
  author = {Ian H. Witten and Timothy C. Bell},
  title = {Source models for natural language text},
  journal = {International Journal of Man-Machine Studies},
  volume = {32},
  number = {5},
  year = {1990},
  issn = {0020-7373},
  pages = {545--579},
  publisher = {Academic Press},
  address = {London, UK},
}

@InProceedings{kleinberg97two,
  author = {Jon M. Kleinberg},
  title = {Two algorithms for nearest-neighbor search in high dimensions},
  booktitle = {Proc. ACM Symposium on Theory of Computing},
  publisher = {ACM Press},
  year = {1997},
  isbn = {0-89791-888-6},
  pages = {599--608},
  location = {El Paso, TX},
  doi = {doi.acm.org/10.1145/258533.258653},
  address = {New York, NY},
}

% Typed as a conference paper: the booktitle is a proceedings ("Proc. SPIRE"),
% like the other SPIRE entry in this file.
@InProceedings{anh06structured,
  author = {Vo Ngoc Anh and Alistair Moffat},
  title = {Structured Index Organizations for High-Throughput Text Querying},
  booktitle = {Proc. SPIRE},
  publisher = {Springer},
  pages = {304--315},
  year = 2006,
}
% series = {Lecture Notes in Computer Science},
% volume = 4209,

@InProceedings{koenemann96interaction,
  author = {J{\"u}rgen Koenemann and Nicholas J.
Belkin},
  title = {A case for interaction: {A} study of interactive information retrieval behavior and effectiveness},
  booktitle = {Proc. SIGCHI},
  publisher = {ACM Press},
  year = {1996},
  isbn = {0-89791-777-4},
  pages = {205--212},
  location = {Vancouver},
  doi = {doi.acm.org/10.1145/238386.238487},
  address = {New York, NY},
}

@Article{dieugenio04kappa,
  author = {Barbara {Di Eugenio} and Michael Glass},
  title = {The Kappa Statistic: {A} Second Look},
  journal = {Computational Linguistics},
  volume = {30},
  number = {1},
  year = {2004},
  pages = {95--101},
  doi = {dx.doi.org/10.1162/089120104773633402},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@InProceedings{singitham04efficiency,
  author = {Pavan Kumar C. Singitham and Mahathi S. Mahabhashyam and Prabhakar Raghavan},
  title = {Efficiency-Quality Tradeoffs for Vector Score Aggregation},
  booktitle = {Proc. VLDB},
  year = {2004},
  pages = {624--635},
  url = {www.vldb.org/conf/2004/RS17P1.PDF},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@InProceedings{buttcher06hybrid,
  author = {Stefan B{\"u}ttcher and Charles L. A. Clarke and Brad Lushman},
  title = {Hybrid index maintenance for growing text collections},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2006},
  isbn = {1-59593-369-7},
  pages = {356--363},
  location = {Seattle, WA},
  doi = {doi.acm.org/10.1145/1148170.1148233},
  address = {New York, NY},
}

@Article{heinz02burst,
  author = {Steffen Heinz and Justin Zobel and Hugh E. Williams},
  title = {Burst tries: {A} fast, efficient data structure for string keys},
  publisher = {ACM Press},
  journal = {TOIS},
  volume = {20},
  number = {2},
  year = {2002},
  issn = {1046-8188},
  pages = {192--223},
  doi = {doi.acm.org/10.1145/506309.506312},
  address = {New York, NY},
}

@InProceedings{ribeiro99efficient,
  author = {Berthier Ribeiro-Neto and Edleno S. Moura and Marden S. Neubert and Nivio Ziviani},
  title = {Efficient distributed algorithms to build inverted files},
  booktitle = {Proc.
SIGIR},
  publisher = {ACM Press},
  year = {1999},
  isbn = {1-58113-096-1},
  pages = {105--112},
  location = {Berkeley, CA},
  doi = {doi.acm.org/10.1145/312624.312663},
  address = {New York, NY},
}

@InProceedings{melnik01building,
  author = {Sergey Melnik and Sriram Raghavan and Beverly Yang and Hector Garcia-Molina},
  title = {Building a distributed full-text index for the {Web}},
  booktitle = {Proc. WWW},
  publisher = {ACM Press},
  year = {2001},
  isbn = {1-58113-348-0},
  pages = {396--406},
  location = {Hong Kong},
  doi = {doi.acm.org/10.1145/371920.372095},
  address = {New York, NY},
}

@Article{lester06efficient,
  author = {Nicholas Lester and Justin Zobel and Hugh E. Williams},
  title = {Efficient online index maintenance for contiguous inverted lists},
  journal = {IP\&M},
  volume = {42},
  number = {4},
  year = {2006},
  pages = {916--933},
  doi = {dx.doi.org/10.1016/j.ipm.2005.09.005},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@Article{williams05searchable,
  author = {Hugh E. Williams and Justin Zobel},
  title = {Searchable words on the {Web}},
  journal = {International Journal on Digital Libraries},
  volume = {5},
  number = {2},
  year = {2005},
  pages = {99--105},
  doi = {dx.doi.org/10.1007/s00799-003-0050-z},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@Article{heinz03efficient,
  author = {Steffen Heinz and Justin Zobel},
  title = {Efficient single-pass index construction for text databases},
  journal = {JASIST},
  volume = {54},
  number = {8},
  year = {2003},
  issn = {1532-2882},
  pages = {713--729},
  doi = {dx.doi.org/10.1002/asi.10268},
  publisher = {John Wiley \& Sons},
  address = {New York, NY},
}

@InProceedings{lester05fast,
  author = {Nicholas Lester and Alistair Moffat and Justin Zobel},
  title = {Fast on-line index construction by geometric partitioning},
  booktitle = {Proc.
CIKM},
  publisher = {ACM Press},
  year = {2005},
  isbn = {1-59593-140-6},
  pages = {776--783},
  location = {Bremen},
  doi = {doi.acm.org/10.1145/1099554.1099739},
  address = {New York, NY},
}

@InProceedings{treeratpituk06experimental,
  author = {Pucktada Treeratpituk and Jamie Callan},
  title = {An experimental study on automatically labeling hierarchical clusters using statistical features},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2006},
  isbn = {1-59593-369-7},
  pages = {707--708},
  location = {Seattle, WA},
  doi = {doi.acm.org/10.1145/1148170.1148328},
  address = {New York, NY},
}

@Manual{r05r,
  author = {{R Development Core Team}},
  title = {R: {A} language and environment for statistical computing},
  organization = {R Foundation for Statistical Computing},
  address = {Vienna},
  year = {2005},
  note = {{ISBN} 3-900051-07-0},
  url = {www.R-project.org},
}

@Article{tombros02effectiveness,
  author = {Anastasios Tombros and Robert Villa and Cornelis Joost {van~Rijsbergen}},
  title = {The effectiveness of query-specific hierarchic clustering in information retrieval},
  journal = {IP\&M},
  volume = {38},
  number = {4},
  year = {2002},
  issn = {0306-4573},
  pages = {559--582},
  doi = {dx.doi.org/10.1016/S0306-4573(01)00048-6},
  publisher = {Pergamon Press},
  address = {Tarrytown, NY},
}

@Article{schwarz78estimating,
  author = {Gideon Schwarz},
  title = {Estimating the dimension of a model},
  journal = {Annals of Statistics},
  year = {1978},
  volume = {6},
  number = 2,
  pages = {461--464},
}

@InProceedings{pelleg00xmeans,
  year = {2000},
  pages = {727--734},
  publisher = {Morgan Kaufmann},
  address = {San Francisco},
  booktitle = {Proc.
ICML},
  author = {Dan Pelleg and Andrew Moore},
  title = {X-means: {E}xtending {K}-means with Efficient Estimation of the Number of Clusters},
}

@Article{akaike74new,
  author = {Hirotugu Akaike},
  title = {A new look at the statistical model identification},
  journal = {{IEEE} Transactions on Automatic Control},
  year = {1974},
  volume = {19},
  number = 6,
  pages = {716--723},
}

@Article{tibshirani01estimating,
  author = {Robert Tibshirani and Guenther Walther and Trevor Hastie},
  title = {Estimating the number of clusters in a data set via the gap statistic},
  journal = {Journal of the Royal Statistical Society Series~B},
  volume = 63,
  year = 2001,
  pages = {411--423},
}

@InProceedings{bradley98scaling,
  author = {Paul S. Bradley and Usama M. Fayyad and Cory Reina},
  title = {Scaling Clustering Algorithms to Large Databases},
  booktitle = {Proc. KDD},
  year = {1998},
  pages = {9--15},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@InProceedings{fayyad98initialization,
  author = {Usama M. Fayyad and Cory Reina and Paul S. Bradley},
  title = {Initialization of Iterative Refinement Clustering Algorithms},
  booktitle = {Proc. KDD},
  year = {1998},
  pages = {194--198},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{macqueen67some,
  author = {James B. MacQueen},
  title = {Some methods for classification and analysis of multivariate observations},
  booktitle = {Proc. Berkeley Symposium on Mathematical Statistics and Probability},
  pages = {281--297},
  year = {1967},
  publisher = {University of California Press},
}
% volume = 1,

@Article{lloyd82least,
  author = {Stuart P. Lloyd},
  title = {Least squares quantization in {PCM}},
  journal = {IEEE Transactions on Information Theory},
  volume = {28},
  number = {2},
  year = {1982},
  pages = {129--136},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@InProceedings{ji06document,
  author = {Xiang Ji and Wei Xu},
  title = {Document clustering with prior knowledge},
  booktitle = {Proc.
SIGIR},
  publisher = {ACM Press},
  year = {2006},
  isbn = {1-59593-369-7},
  pages = {405--412},
  location = {Seattle, WA},
  doi = {doi.acm.org/10.1145/1148170.1148241},
  address = {New York, NY},
}

@PhDThesis{strehl02relationship,
  author = {Alexander Strehl},
  title = {Relationship-based Clustering and Cluster Ensembles for High-dimensional Data Mining},
  year = {2002},
  month = may,
  school = {The University of Texas at Austin},
}

@InProceedings{yang06near,
  author = {Hui Yang and Jamie Callan},
  title = {Near-duplicate detection by instance-level constrained clustering},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2006},
  isbn = {1-59593-369-7},
  pages = {421--428},
  location = {Seattle, WA},
  doi = {doi.acm.org/10.1145/1148170.1148243},
  address = {New York, NY},
}

@Book{salton75dynamic,
  author = {Gerard Salton},
  title = {Dynamic information and library processing},
  year = {1975},
  isbn = {0132213257},
  publisher = {Prentice Hall},
  address = {Upper Saddle River, NJ},
}

@InProceedings{liu04cluster,
  author = {Xiaoyong Liu and W. Bruce Croft},
  title = {Cluster-based retrieval using language models},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2004},
  isbn = {1-58113-881-4},
  pages = {186--193},
  location = {Sheffield},
  doi = {doi.acm.org/10.1145/1008992.1009026},
  address = {New York, NY},
}

@Article{hearst06clustering,
  author = {Marti A. Hearst},
  title = {Clustering versus faceted categories for information exploration},
  publisher = {ACM Press},
  journal = {CACM},
  volume = {49},
  number = {4},
  year = {2006},
  issn = {0001-0782},
  pages = {59--61},
  doi = {doi.acm.org/10.1145/1121949.1121983},
  address = {New York, NY},
}

@InProceedings{zamir99grouper,
  author = {Oren Zamir and Oren Etzioni},
  title = {Grouper: {A} dynamic clustering interface to {Web} search results},
  booktitle = {Proc.
WWW},
  year = {1999},
  pages = {1361--1374},
  location = {Toronto},
  doi = {dx.doi.org/10.1016/S1389-1286(99)00054-7},
  publisher = {Elsevier North-Holland},
  address = {New York, NY},
}

% Cluster-comparison indices (Rand index and its adjusted form).
@Article{hubert85comparing,
  author = {Lawrence Hubert and Phipps Arabie},
  journal = {Journal of Classification},
  pages = {193--218},
  title = {Comparing partitions},
  volume = {2},
  year = {1985},
}

@Article{rand71objective,
  author = {William M. Rand},
  journal = {Journal of the American Statistical Association},
  pages = {846--850},
  title = {Objective criteria for the evaluation of clustering methods},
  volume = {66},
  number = 336,
  year = {1971},
}

% DBLP import: trailing period removed from the title.
@InProceedings{hamerly03kmeans,
  author = {Greg Hamerly and Charles Elkan},
  title = {Learning the $k$ in $k$-means},
  booktitle = {Proc. NIPS},
  year = {2003},
  url = {books.nips.cc/papers/files/nips16/NIPS2003\_AA36.pdf},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@InProceedings{vaithyanathan00modelbased,
  author = {Shivakumar Vaithyanathan and Byron Dom},
  title = {Model-Based Hierarchical Clustering},
  booktitle = {Proc. UAI},
  year = {2000},
  isbn = {1-55860-709-9},
  pages = {599--608},
  publisher = {Morgan Kaufmann},
  address = {San Francisco, CA},
}

@InProceedings{lewis96training,
  author = {David D. Lewis and Robert E. Schapire and James P. Callan and Ron Papka},
  title = {Training algorithms for linear text classifiers},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {1996},
  isbn = {0-89791-792-8},
  pages = {298--306},
  location = {Zurich},
  doi = {doi.acm.org/10.1145/243199.243277},
  address = {New York, NY},
}

@Article{dietterich95multiclass,
  author = {Thomas G. 
Dietterich and Ghulum Bakiri},
  title = {Solving Multiclass Learning Problems via Error-Correcting Output Codes},
  journal = {Journal of Artificial Intelligence Research},
  volume = {2},
  year = {1995},
  pages = {263--286},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

% In this stretch, DBLP-imported titles have had their trailing period removed
% and page ranges use the double hyphen.
@InProceedings{kaki05findex,
  author = {Mika K{\"a}ki},
  title = {Findex: {S}earch result categories help users when document ranking fails},
  booktitle = {Proc. SIGCHI},
  publisher = {ACM Press},
  year = {2005},
  isbn = {1-58113-998-5},
  pages = {131--140},
  doi = {doi.acm.org/10.1145/1054972.1054991},
  address = {New York, NY},
}

@Article{allwein00reducing,
  author = {Erin L. Allwein and Robert E. Schapire and Yoram Singer},
  title = {Reducing Multiclass to Binary: {A} Unifying Approach for Margin Classifiers},
  journal = {JMLR},
  volume = {1},
  year = {2000},
  pages = {113--141},
  url = {www.jmlr.org/papers/volume1/allwein00a/allwein00a.pdf},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@InProceedings{lewis98naive,
  author = {David D. Lewis},
  title = {Naive ({B}ayes) at Forty: {T}he Independence Assumption in Information Retrieval},
  booktitle = {Proc. ECML},
  year = {1998},
  isbn = {3-540-64417-2},
  pages = {4--15},
  publisher = {Springer},
  address = {London, UK},
}

@InProceedings{ng01discriminative,
  author = {Andrew Y. Ng and Michael I. Jordan},
  title = {On Discriminative vs. Generative Classifiers: {A} comparison of logistic regression and naive {B}ayes},
  booktitle = {Proc. NIPS},
  year = {2001},
  pages = {841--848},
  url = {www-2.cs.cmu.edu/Groups/NIPS/NIPS2001/papers/psgz/AA28.ps.gz},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@Book{snedecor89,
  title = {Statistical methods},
  author = {George Waddel Snedecor and William G. Cochran},
  year = 1989,
  publisher = {Iowa State University Press},
}

@Book{harold04xml,
  author = {Elliotte Rusty Harold and Scott W. 
Means},
  howpublished = {Paperback},
  isbn = {0596007647},
  keywords = {xml},
  month = oct,
  publisher = {O'Reilly},
  title = {{XML} in a Nutshell},
  edition = {3rd},
  year = {2004},
}

% XML retrieval (INEX). Both INEX papers inherit from the fuhr03inex parent entry.
@InProceedings{mass02juruxml,
  author = {Yosi Mass and Matan Mandelbrod and Einat Amitay and David Carmel and Yo{\"e}lle S. Maarek and Aya Soffer},
  title = {{JuruXML} -- {A}n {XML} Retrieval System at {INEX'02}},
  booktitle = {Proc. INEX},
  year = {2003},
  pages = {73--80},
  crossref = {fuhr03inex},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@InProceedings{govert03overview,
  author = {Norbert G{\"o}vert and Gabriella Kazai},
  title = {Overview of the {INitiative for the Evaluation of {XML}} retrieval ({INEX}) 2002},
  pages = {1--17},
  year = {2003},
  crossref = {fuhr03inex},
  entrydate = 20030226,
  key = {Goevert/Kazai:03},
}

% Maarek's given name is spelled as in mass02juruxml so that both entries
% resolve to one author.
@InProceedings{carmel03fragments,
  author = {David Carmel and Yo{\"e}lle S. Maarek and Matan Mandelbrod and Yosi Mass and Aya Soffer},
  title = {Searching {XML} documents via {XML} fragments},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2003},
  isbn = {1-58113-646-3},
  pages = {151--158},
  location = {Toronto},
  doi = {doi.acm.org/10.1145/860435.860464},
  address = {New York, NY},
}

@InProceedings{anh06pruned,
  author = {Vo Ngoc Anh and Alistair Moffat},
  title = {Pruned query evaluation using pre-computed impacts},
  booktitle = {Proc. 
SIGIR},
  publisher = {ACM Press},
  year = {2006},
  isbn = {1-59593-369-7},
  pages = {372--379},
  location = {Seattle, WA},
  doi = {doi.acm.org/10.1145/1148170.1148235},
  address = {New York, NY},
}

% Searching compressed text.
@Article{moura00fast,
  author = {de Moura, Edleno Silva and Gonzalo Navarro and Nivio Ziviani and Ricardo Baeza-Yates},
  title = {Fast and flexible word searching on compressed text},
  publisher = {ACM Press},
  journal = {TOIS},
  volume = {18},
  number = {2},
  year = {2000},
  issn = {1046-8188},
  pages = {113--139},
  doi = {doi.acm.org/10.1145/348751.348754},
  address = {New York, NY},
}

% Filtering and frequency-sorted indexes.
@InProceedings{persin94filtered,
  author = {Michael Persin},
  title = {Document filtering for fast ranking},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {1994},
  pages = {339--348},
  address = {New York, NY},
}

@Article{persin96filtered,
  author = {Michael Persin and Justin Zobel and Ron Sacks-Davis},
  title = {Filtered document retrieval with frequency-sorted indexes},
  journal = {JASIS},
  volume = {47},
  number = {10},
  year = {1996},
  issn = {0002-8231},
  pages = {749--764},
  publisher = {John Wiley \& Sons},
  address = {New York, NY},
}

@InProceedings{scholer02inverted,
  author = {Falk Scholer and Hugh E. Williams and John Yiannis and Justin Zobel},
  title = {Compression of inverted indexes for fast query evaluation},
  booktitle = {Proc. 
SIGIR},
  publisher = {ACM Press},
  year = {2002},
  isbn = {1-58113-561-0},
  pages = {222--229},
  location = {Tampere, Finland},
  doi = {doi.acm.org/10.1145/564376.564416},
  address = {New York, NY},
}

% Inverted-index compression and early-termination ranking.
@Article{anh05invertedindex,
  author = {Vo Ngoc Anh and Alistair Moffat},
  title = {Inverted Index Compression Using Word-Aligned Binary Codes},
  journal = {IR},
  volume = {8},
  number = {1},
  year = {2005},
  issn = {1386-4564},
  pages = {151--166},
  doi = {dx.doi.org/10.1023/B:INRT.0000048490.99518.5c},
  publisher = {Kluwer},
  address = {Hingham, MA},
}

@Article{moffat96selfindexing,
  author = {Alistair Moffat and Justin Zobel},
  title = {Self-indexing inverted files for fast text retrieval},
  publisher = {ACM Press},
  journal = {TOIS},
  volume = {14},
  number = {4},
  year = {1996},
  issn = {1046-8188},
  pages = {349--379},
  address = {New York, NY},
}

@InProceedings{anh01termination,
  author = {Vo Ngoc Anh and Owen de Kretser and Alistair Moffat},
  title = {Vector-space ranking with effective early termination},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2001},
  isbn = {1-58113-331-6},
  pages = {35--42},
  location = {New Orleans, LA},
  address = {New York, NY},
}

@InProceedings{dean04mapreduce,
  author = {Jeffrey Dean and Sanjay Ghemawat},
  title = {{MapReduce}: {S}implified Data Processing on Large Clusters},
  booktitle = {Proc. Symposium on Operating System Design and Implementation},
  address = {San Francisco, CA},
  year = 2004,
}

@Article{harman90retrieving,
  author = {Donna Harman and Gerald Candela},
  title = {Retrieving records from a gigabyte of text on a minicomputer using statistical ranking},
  journal = {JASIS},
  volume = 41,
  number = 8,
  year = 1990,
  pages = {581--589},
}

@InProceedings{steinbach00comparison,
  author = {Michael Steinbach and George Karypis and Vipin Kumar},
  title = {A comparison of document clustering techniques},
  booktitle = {KDD Workshop on Text Mining},
  year = {2000},
}

@InProceedings{glover02structure,
  author = {Eric J. 
Glover and Kostas Tsioutsiouliklis and Steve Lawrence and David M. Pennock and Gary W. Flake},
  title = {Using web structure for classifying and describing web pages},
  booktitle = {Proc. WWW},
  publisher = {ACM Press},
  year = {2002},
  isbn = {1-58113-449-5},
  pages = {562--569},
  location = {Honolulu, HI},
  doi = {doi.acm.org/10.1145/511446.511520},
  address = {New York, NY},
}

% Hierarchical clustering: surveys and Ward's method.
@Article{jain99data,
  author = {Anil Jain and M. Narasimha Murty and Patrick Flynn},
  title = {Data clustering: {A} review},
  publisher = {ACM Press},
  journal = {ACM Computing Surveys},
  volume = {31},
  number = {3},
  year = {1999},
  issn = {0360-0300},
  pages = {264--323},
  address = {New York, NY},
}

% The braces keep the "Jr." suffix attached to the surname; page range uses
% the double hyphen.
@Article{ward63hierarchical,
  author = {J. H. {Ward Jr.}},
  title = {Hierarchical grouping to optimize an objective function},
  journal = {Journal of the American Statistical Association},
  volume = 58,
  pages = {236--244},
  year = 1963,
}

@InProceedings{elhamdouchi86hierarchic,
  author = {Abdelmoula El-Hamdouchi and Peter Willett},
  title = {Hierarchic document classification using {Ward}'s clustering method},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {1986},
  isbn = {0-89791-187-3},
  pages = {149--156},
  location = {Pisa},
  doi = {doi.acm.org/10.1145/253168.253200},
  address = {New York, NY},
}

% Trailing period removed from the title.
@Article{murtagh83survey,
  author = {Fionn Murtagh},
  title = {A Survey of Recent Advances in Hierarchical Clustering Algorithms},
  journal = {Computer Journal},
  volume = {26},
  number = {4},
  year = {1983},
  pages = {354--359},
}

% address written "Cambridge, MA" to match the other MIT Press entries.
@Book{cormen90algorithms,
  author = {Thomas H. Cormen and Charles Eric Leiserson and Ronald L. Rivest},
  title = {Introduction to Algorithms},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  pages = {xvii + 1028},
  year = {1990},
  isbn = {0-262-03141-8, 0-07-013143-0 (McGraw Hill)},
  isbn-13 = {978-0-262-03141-7, 978-0-07-013143-9 (McGraw Hill)},
  lccn = {QA76.6 .C662 1990},
}

@Article{day84efficient,
  author = {William H. 
Day and Herbert Edelsbrunner},
  year = 1984,
  title = {Efficient Algorithms for Agglomerative Hierarchical Clustering Methods},
  journal = {Journal of Classification},
  volume = 1,
  pages = {1--24},
}

% Volume corrected from 69 to 62: this file's other JASA entries put volume 58
% in 1963 and volume 66 in 1971, so 1967 is volume 62.
@Article{king67stepwise,
  author = {Benjamin King},
  title = {Step-wise clustering procedures},
  journal = {Journal of the American Statistical Association},
  volume = {62},
  year = {1967},
  pages = {86--101},
}

% Initials separated so BibTeX sees two given-name tokens; ISBN hyphenated.
@Book{sneath73numerical,
  author = {Peter H. A. Sneath and Robert R. Sokal},
  title = {Numerical Taxonomy: {T}he Principles and Practice of Numerical Classification},
  publisher = {W.H. Freeman},
  address = {San Francisco},
  year = 1973,
  isbn = {0-7167-0697-0},
}

@TechReport{voorhees85effectiveness,
  author = {Ellen M. Voorhees},
  title = {The Effectiveness and Efficiency of Agglomerative Hierarchic Clustering in Document Retrieval},
  institution = {Cornell},
  year = 1985,
  number = {TR 85-705},
}

% Cluster labeling and multi-document summarization.
@Unpublished{popescul00automatic,
  title = {Automatic Labeling of Document Clusters},
  author = {Alexandrin Popescul and Lyle H. Ungar},
  note = {Unpublished \textsc{ms}, U. Pennsylvania},
  year = 2000,
  url = {http://www.cis.upenn.edu/~popescul/Publications/popescul00labeling.pdf}
}

@InProceedings{mckeown95generating,
  author = {Kathleen McKeown and Dragomir R. Radev},
  title = {Generating summaries of multiple news articles},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {1995},
  isbn = {0-89791-714-6},
  pages = {74--82},
  location = {Seattle, WA},
  doi = {doi.acm.org/10.1145/215206.215334},
  address = {New York, NY},
}

@InProceedings{glover02inferring,
  author = {Eric Glover and David M. Pennock and Steve Lawrence and Robert Krovetz},
  title = {Inferring hierarchical descriptions},
  booktitle = {Proc. 
CIKM},
  publisher = {ACM Press},
  year = {2002},
  isbn = {1-58113-492-4},
  pages = {507--514},
  location = {McLean, VA},
  doi = {doi.acm.org/10.1145/584792.584876},
  address = {New York, NY},
}

% The field name must be "editor": BibTeX silently ignores unknown fields, so
% the former "editors" spelling dropped the volume editors from the output.
@InCollection{darrell06locality,
  title = {Locality-sensitive hashing using stable distributions},
  booktitle = {Nearest Neighbor Methods in Learning and Vision: {T}heory and Practice},
  author = {Alexandr Andoni and Mayur Datar and Nicole Immorlica and Piotr Indyk and Vahab Mirrokni},
  editor = {T. Darrell and P. Indyk and G. Shakhnarovich},
  publisher = {MIT Press},
  year = 2006,
}

@Article{cover67nearest,
  author = {Thomas M. Cover and Peter E. Hart},
  title = {Nearest neighbor pattern classification},
  journal = {IEEE Transactions on Information Theory},
  volume = {13},
  number = {1},
  year = {1967},
  pages = {21--27},
}

@InProceedings{yang94expert,
  author = {Yiming Yang},
  title = {Expert network: {E}ffective and efficient learning from human decisions in text categorization and retrieval},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {13--22},
  year = 1994,
}

@InProceedings{turtle94boolean,
  author = {Howard Turtle},
  year = {1994},
  title = {Natural language vs. {B}oolean query evaluation: {A} comparison of retrieval performance},
  pages = {212--220},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
}

% The title is wrapped in braces so its mixed casing survives style recasing.
@InProceedings{lita03truecasing,
  author = {Lucian Vlad Lita and Abe Ittycheriah and Salim Roukos and Nanda Kambhatla},
  title = {{tRuEcasIng}},
  booktitle = {Proc. ACL},
  year = 2003,
  pages = {152--159},
}

@InProceedings{boldi05skiplists,
  author = {Paolo Boldi and Sebastiano Vigna},
  title = {Compressed perfect embedded skip lists for quick inverted-index lookups},
  booktitle = {Proc. SPIRE},
  publisher = {Springer},
  year = 2005,
}
% series = {Lecture Notes in Computer Science},

@Book{manning99foundations,
  author = {Christopher D. 
Manning and Hinrich Sch{\"u}tze},
  title = {Foundations of Statistical Natural Language Processing},
  year = 1999,
  address = {Cambridge, MA},
  publisher = {MIT Press},
}

% Phrase querying.
@InProceedings{bahle02phrase,
  author = {Dirk Bahle and Hugh E. Williams and Justin Zobel},
  year = 2002,
  title = {Efficient Phrase Querying with an Auxiliary Index},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {215--221},
}

@Article{williams04phrase,
  author = {Hugh E. Williams and Justin Zobel and Dirk Bahle},
  year = 2004,
  title = {Fast Phrase Querying With Combined Indexes},
  journal = {TOIS},
  volume = {22},
  number = {4},
  pages = {573--594},
}

@InProceedings{sproat03bakeoff,
  author = {Richard Sproat and Thomas Emerson},
  year = 2003,
  title = {The First International {C}hinese Word Segmentation Bakeoff},
  booktitle = {SIGHAN Workshop on Chinese Language Processing},
}

@Book{witten99gigabytes,
  author = {Ian H. Witten and Alistair Moffat and Timothy C. Bell},
  title = {Managing Gigabytes: {C}ompressing and Indexing Documents and Images},
  publisher = {Morgan Kaufmann},
  address = {San Francisco, CA},
  year = 1999,
  edition = {2nd},
}

% Classification: event models, bias and variance, pattern recognition.
@InProceedings{mccallum98comparison,
  author = {Andrew McCallum and Kamal Nigam},
  title = {A Comparison of Event Models for {N}aive {B}ayes Text Classification},
  year = 1998,
  booktitle = {AAAI/ICML Workshop on Learning for Text Categorization},
  pages = {41--48},
}

@Article{friedman97bias,
  author = {Jerome H. Friedman},
  title = {On Bias, Variance, 0/1--Loss, and the Curse-of-Dimensionality},
  journal = {Data Mining and Knowledge Discovery},
  year = {1997},
  volume = {1},
  number = {1},
  pages = {55--77},
  annote = {Also, Technical Report, Stanford University, 1996},
}

@Book{duda00pattern,
  author = {Richard O. Duda and Peter E. Hart and David G. Stork},
  title = {Pattern Classification},
  edition = {2nd},
  year = {2000},
  isbn = {0471056693},
  publisher = {Wiley-Interscience},
}

@InProceedings{croft78cluster,
  author = {W. 
Bruce Croft},
  title = {A file organization for cluster-based retrieval},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {1978},
  pages = {65--82},
  address = {New York, NY},
}

% Text categorization.
@Article{sebastiani02automated,
  author = {Fabrizio Sebastiani},
  title = {Machine Learning in Automated Text Categorization},
  journal = {ACM Computing Surveys},
  volume = 34,
  number = 1,
  pages = {1--47},
  year = 2002,
}

@InProceedings{yang97selection,
  author = {Yiming Yang and Jan Pedersen},
  title = {Feature selection in statistical learning of text categorization},
  booktitle = {Proc. ICML},
  year = 1997,
}

% Trailing period removed from the title; page range uses the double hyphen.
@InProceedings{li03loss,
  author = {Fan Li and Yiming Yang},
  title = {A Loss Function Analysis for Classification Methods in Text Categorization},
  booktitle = {Proc. ICML},
  year = {2003},
  pages = {472--479},
}

@Book{voorhees05experiment,
  editor = {Ellen M. Voorhees and Donna Harman},
  title = {TREC: {E}xperiment and Evaluation in Information Retrieval},
  publisher = {MIT Press},
  year = 2005,
}

@Article{elias75universal,
  author = {Peter Elias},
  title = {Universal Code word sets and representations of the integers},
  journal = {IEEE Transactions on Information Theory},
  volume = 21,
  number = 2,
  year = 1975,
  pages = {194--203},
}

% Issue and page ranges use the double hyphen.
@Article{domingos97optimality,
  author = {Pedro Domingos and Michael J. Pazzani},
  title = {On the Optimality of the Simple {B}ayesian Classifier under Zero-One Loss},
  journal = {Machine Learning},
  volume = {29},
  number = {2--3},
  pages = {103--130},
  year = {1997},
  url = {citeseer.ist.psu.edu/domingos97optimality.html},
}

@Article{zhang01text,
  author = {Tong Zhang and Frank J. Oles},
  title = {Text Categorization Based on Regularized Linear Classification Methods},
  journal = {IR},
  volume = {4},
  number = {1},
  publisher = {Kluwer},
  pages = {5--31},
  year = {2001},
  url = {citeseer.ist.psu.edu/zhang00text.html},
}

@Article{lewis04benchmark,
  author = {David D. Lewis and Yiming Yang and Tony G. 
Rose and Fan Li},
  title = {{RCV1}: {A} New Benchmark Collection for Text Categorization Research},
  journal = {JMLR},
  volume = {5},
  year = {2004},
  issn = {1533-7928},
  pages = {361--397},
  publisher = {MIT Press},
}

% Textbooks and monographs.
@Book{joachims2002classify,
  author = {Thorsten Joachims},
  title = {Learning to Classify Text Using Support Vector Machines},
  publisher = {Kluwer},
  year = 2002,
}

@Book{hastie2001elements,
  title = {The Elements of Statistical Learning: {D}ata Mining, Inference, and Prediction},
  author = {Trevor Hastie and Robert Tibshirani and Jerome H. Friedman},
  publisher = {Springer},
  address = {New York},
  year = 2001,
}

@Book{korfhage97,
  title = {Information Storage and Retrieval},
  author = {Robert R. Korfhage},
  year = 1997,
  publisher = {Wiley},
}

@Book{baezayates99,
  title = {Modern Information Retrieval},
  author = {Ricardo Baeza-Yates and Berthier Ribeiro-Neto},
  publisher = {Addison Wesley},
  address = {Harlow},
  year = 1999,
}

@Book{chakrabarti02,
  author = {Soumen Chakrabarti},
  title = {Mining the Web: {A}nalysis of Hypertext and Semi Structured Data},
  publisher = {Morgan Kaufmann},
  year = {2002},
}

@InCollection{rocchio71,
  author = {J. J. 
Rocchio},
  title = {Relevance feedback in information retrieval},
  crossref = {salton71smart},
  year = 1971,
  pages = {313--323},
}

@InCollection{salton71cluster,
  author = {Gerard Salton},
  title = {Cluster search strategies and the optimization of retrieval effectiveness},
  crossref = {salton71smart},
  year = 1971,
  pages = {223--242},
}

% Parent entry for the SMART chapters. booktitle repeats title because classic
% BibTeX copies fields verbatim to the children and does not map title to
% booktitle.
@Book{salton71smart,
  editor = {Gerard Salton},
  title = {The {SMART} Retrieval System -- Experiments in Automatic Document Processing},
  booktitle = {The {SMART} Retrieval System -- Experiments in Automatic Document Processing},
  publisher = {Prentice Hall},
  address = {Englewood Cliffs, NJ},
  year = 1971,
}

% address written "Cambridge, MA" to match the other entries in this file.
@Book{zipf49human,
  author = {George Kingsley Zipf},
  year = 1949,
  title = {Human Behavior and the Principle of Least Effort},
  publisher = {Addison Wesley},
  address = {Cambridge, MA},
}

@Article{dunning93accurate,
  author = {Ted Dunning},
  title = {Accurate Methods for the Statistics of Surprise and Coincidence},
  year = 1993,
  journal = {Computational Linguistics},
  volume = 19,
  number = 1,
  pages = {61--74},
}

@Article{jardine71hierarchic,
  author = {N. Jardine and Cornelis Joost {van~Rijsbergen}},
  title = {The use of hierarchic clustering in information retrieval},
  journal = {Information Storage and Retrieval},
  volume = 7,
  pages = {217--240},
  year = 1971,
}

@InProceedings{singhal97,
  author = {Amit Singhal and Mandar Mitra and Chris Buckley},
  title = {Learning Routing Queries in a Query Zone},
  year = 1997,
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {25--32},
}

@InProceedings{schutze97projections,
  author = {Hinrich Sch{\"u}tze and Craig Silverstein},
  title = {Projections for Efficient Document Clustering},
  year = 1997,
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {74--81},
}

@InProceedings{voorhees85,
  author = {Ellen M. Voorhees},
  title = {The cluster hypothesis revisited},
  year = 1985,
  pages = {188--196},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
}

@InProceedings{hp96,
  author = {Marti A. 
Hearst and Jan O. Pedersen},
  title = {Reexamining the Cluster Hypothesis},
  year = 1996,
  pages = {76--84},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  location = {Zurich},
}
% hp96 above: the conference city is stored in location, as in this file's
% other SIGIR 1996 entry; address is the publisher's city and would otherwise
% print as "Zurich: ACM Press".

@InProceedings{dum-95,
  author = {Susan T. Dumais},
  title = {Latent Semantic Indexing ({LSI}): {TREC}-3 Report},
  pages = {219--230},
  year = 1995,
  booktitle = {Proc. TREC},
}

@inproceedings{buckley95new,
  author = {Chris Buckley and Amit Singhal and Mandar Mitra},
  title = {New Retrieval Approaches Using {SMART}: {TREC} 4},
  booktitle = {Proc. TREC},
  year = {1995},
  ee = {trec.nist.gov/pubs/trec4/papers/Cornell_trec4.ps.gz},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{vanrijsbergen89towards,
  author = {Cornelis Joost {van~Rijsbergen}},
  title = {Towards an information logic},
  booktitle = {Proc. SIGIR},
  year = {1989},
  isbn = {0-89791-321-3},
  pages = {77--86},
  location = {Cambridge, Massachusetts, United States},
  doi = {doi.acm.org/10.1145/75334.75344},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@Book{rij79,
  author = {Cornelis Joost {van~Rijsbergen}},
  title = {Information Retrieval},
  year = 1979,
  edition = {2nd},
  address = {London},
  publisher = {Butterworths},
}

@Book{roget,
  author = {P. M. Roget},
  title = {Roget's International Thesaurus},
  publisher = {Thomas Y. Crowell},
  year = {1946},
  address = {New York},
}

% Scatter/Gather. Karger's name is written identically in both entries so an
% author index lists him once.
@InProceedings{ckp93,
  author = {Douglas R. Cutting and David R. Karger and Jan O. Pedersen},
  title = {Constant Interaction-Time {S}catter/{G}ather Browsing of Very Large Document Collections},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = 1993,
  pages = {126--134},
}

@InProceedings{cutting92scattergather,
  author = {Douglas R. Cutting and Jan O. Pedersen and David R. Karger and John W. Tukey},
  title = {{Scatter/Gather}: {A} Cluster-based Approach to Browsing Large Document Collections},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = 1992,
  pages = {318--329},
}

@InProceedings{qf93,
  author = {Yonggang Qiu and H. P. 
Frei},
  title = {Concept Based Query Expansion},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = 1993,
  pages = {160--169},
}

% Inference networks.
@Article{turtle91,
  author = {Howard Turtle and W. Bruce Croft},
  title = {Evaluation of an Inference Network-Based Retrieval Model},
  year = 1991,
  journal = {TOIS},
  volume = 9,
  number = 3,
  pages = {187--222},
}

@InProceedings{turtle89,
  author = {Howard Turtle and W. Bruce Croft},
  title = {Inference networks for document retrieval},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {1--24},
  year = 1989,
}

% Latent semantic indexing.
@InProceedings{dumais93,
  author = {Susan T. Dumais},
  title = {Latent Semantic Indexing ({LSI}) and {TREC-2}},
  booktitle = {Proc. TREC},
  pages = {105--115},
  year = 1993,
}

@Article{dee90,
  author = {Scott Deerwester and Susan T. Dumais and George W. Furnas and Thomas K. Landauer and Richard Harshman},
  year = 1990,
  title = {Indexing by latent semantic analysis},
  journal = {JASIS},
  volume = 41,
  number = 6,
  pages = {391--407},
}

% Term weighting and relevance feedback.
@Article{robertson76relevance,
  author = {Stephen E. Robertson and Karen Sp{\"a}rck Jones},
  year = 1976,
  title = {Relevance Weighting of Search Terms},
  journal = {JASIS},
  volume = 27,
  pages = {129--146},
}

@Article{spa72,
  author = {Sp{\"a}rck Jones, Karen},
  year = 1972,
  title = {A statistical interpretation of term specificity and its application in retrieval},
  journal = {Journal of Documentation},
  volume = 28,
  number = 1,
  pages = {11--21},
}

@Article{sb90,
  author = {Gerard Salton and Chris Buckley},
  year = 1990,
  title = {Improving Retrieval Performance by Relevance Feedback},
  journal = {JASIS},
  volume = 41,
  number = 4,
  pages = {288--297},
}

% Initials are space-separated: BibTeX treats "A.P." as a single given-name
% token, so abbreviating styles would drop the second initial.
@Article{dlr77,
  author = {A. P. Dempster and N. M. Laird and D. B. Rubin},
  year = 1977,
  title = {Maximum likelihood from incomplete data via the {EM} algorithm},
  journal = {Journal of the Royal Statistical Society Series~B},
  volume = 39,
  pages = {1--38},
}

@Book{jain88algorithms,
  author = {Anil K. Jain and Richard C. 
Dubes},
  title = {Algorithms for Clustering Data},
  address = {Englewood Cliffs, NJ},
  publisher = {Prentice Hall},
  year = 1988,
}

% SIGIR 1995 papers: routing, summarization, classifier evaluation.
@InProceedings{shp95,
  author = {Hinrich Sch{\"u}tze and David A. Hull and Jan O. Pedersen},
  title = {A Comparison of Classifiers and Document Representations for the Routing Problem},
  year = 1995,
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {229--237},
}

@InProceedings{kupiec95,
  author = {Julian Kupiec and Jan Pedersen and Francine Chen},
  title = {A Trainable Document Summarizer},
  year = 1995,
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {68--73},
}

@InProceedings{lewis95,
  author = {David D. Lewis},
  title = {Evaluating and Optimizing Autonomous Text Classification Systems},
  year = 1995,
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
}

% Probabilistic retrieval models.
@article{fuhr94b,
  author = {Norbert Fuhr and Ulrich Pfeifer},
  title = {Probabilistic information retrieval as a combination of abstraction, inductive learning, and probabilistic assumptions},
  journal = {TOIS},
  volume = {12},
  number = {1},
  year = {1994},
  issn = {1046-8188},
  pages = {92--115},
  doi = {doi.acm.org/10.1145/174608.174612},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@InProceedings{ccg94,
  author = {William S. Cooper and Aitao Chen and Fredric C. Gey},
  title = {Full Text Retrieval based on Probabilistic Equations with Coefficients fitted by Logistic Regression},
  year = 1994,
  pages = {57--66},
  booktitle = {Proc. TREC},
}

@Article{fuhr89,
  author = {Norbert Fuhr},
  title = {Optimum Polynomial Retrieval Functions Based on the Probability Ranking Principle},
  journal = {TOIS},
  volume = 7,
  number = 3,
  pages = {183--204},
  year = 1989,
}

@Article{saracevic88users,
  author = {Saracevic, Tefko and Kantor, Paul},
  year = 1988,
  title = {A study of information seeking and retrieving. 
{II}: {U}sers, questions and effectiveness},
  journal = {JASIS},
  volume = 39,
  pages = {177--196},
}

% Part III of the same study. Year corrected from 1996 to 1988: it is in the
% same JASIS volume 39 as part II above, which is dated 1988.
@Article{saracevic88,
  author = {Tefko Saracevic and Paul Kantor},
  title = {A Study of Information Seeking and Retrieving. {III}: Searchers, Searches, Overlap},
  journal = {JASIS},
  volume = 39,
  number = 3,
  pages = {197--216},
  year = 1988,
}

@InProceedings{hull93using,
  author = {David Hull},
  title = {Using Statistical Testing in the Evaluation of Retrieval Performance},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {329--338},
  year = 1993,
}

@Article{schuetze98,
  author = {Hinrich Sch{\"u}tze},
  title = {Automatic Word Sense Discrimination},
  year = 1998,
  journal = {Computational Linguistics},
  volume = 24,
  number = 1,
  pages = {97--124},
}

@Book{mitchell97machine,
  author = {Tom M. Mitchell},
  title = {Machine Learning},
  publisher = {McGraw Hill},
  address = {New York},
  year = 1997,
}

@Article{croftharper79,
  author = {W. Bruce Croft and David J. Harper},
  year = 1979,
  title = {Using Probabilistic Models of Document Retrieval Without Relevance Information},
  journal = {Journal of Documentation},
  volume = 35,
  number = 4,
  pages = {285--295},
}

@Article{moffatzobel98,
  author = {Alistair Moffat and Justin Zobel},
  title = {Exploring the Similarity Space},
  journal = {SIGIR Forum},
  volume = 32,
  number = 1,
  year = 1998,
}

% The chapter comes before its parent book: classic BibTeX needs a
% cross-referenced entry to appear after the entries that refer to it.
@InCollection{rasmussen92,
  author = {Edie Rasmussen},
  year = 1992,
  title = {Clustering Algorithms},
  pages = {419--442},
  crossref = {frakes92information}
}

@book{frakes92information,
  editor = {William B. Frakes and Ricardo Baeza-Yates},
  year = 1992,
  title = {Information Retrieval: {D}ata Structures and Algorithms},
  publisher = {Prentice Hall},
  address = {Englewood Cliffs, NJ},
}

@Book{kaufman90finding,
  title = {Finding groups in data},
  year = 1990,
  address = {New York},
  author = {Leonard Kaufman and Peter J. 
Rousseeuw},
  publisher = {Wiley},
}

% Text categorization evaluations.
@Article{yang99evaluation,
  author = {Yiming Yang},
  title = {An Evaluation of Statistical Approaches to Text Categorization},
  year = 1999,
  journal = {IR},
  volume = {1},
  pages = {69--90},
}

@InProceedings{yang99re-examination,
  author = {Yiming Yang and Xin Liu},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  title = {A re-examination of text categorization methods},
  year = 1999,
  pages = {42--49},
}

% Minsky and Papert wrote this monograph, so they belong in author; with
% editor, styles would print them as "(eds.)".
@Book{minskypapert88,
  author = {Marvin Lee Minsky and Seymour Papert},
  title = {Perceptrons: {A}n introduction to computational geometry},
  address = {Cambridge, MA},
  publisher = {MIT Press},
  year = 1988,
  note = {Expanded edition.},
}

@InProceedings{lewis94comparison,
  author = {David D. Lewis and Marc Ringuette},
  title = {A comparison of two learning algorithms for text categorization},
  booktitle = {Proc. SDAIR},
  year = {1994},
  address = {Las Vegas, NV},
  pages = {81--93},
}

@Article{berrydumais95,
  author = {Michael W. Berry and Susan T. Dumais and Gavin W. O'Brien},
  journal = {SIAM Review},
  volume = 37,
  number = 4,
  pages = {573--595},
  title = {Using linear algebra for intelligent information retrieval},
  year = {1995},
}

@Article{apte94automated,
  title = {Automated Learning of Decision Rules for Text Categorization},
  author = {Chidanand Apt{\'e} and Fred Damerau and Sholom M. 
Weiss},
  journal = {TOIS},
  volume = 12,
  number = 3,
  year = 1994,
  pages = {233--251},
}
% apte94automated above: issue number corrected from 1 to 3. Pages 233--251
% cannot be in TOIS 12(1), which this file's fuhr94b entry places at pp. 92--115.

% Chinese word segmentation and CJK text processing.
@Article{sproat96segmentation,
  author = {Richard Sproat and William Gale and Chilin Shih and Nancy Chang},
  title = {A stochastic finite-state word-segmentation algorithm for {C}hinese},
  journal = {Computational Linguistics},
  year = 1996,
  pages = {377--404},
  volume = 22,
  number = 3,
}

@InProceedings{tseng05segmentation,
  author = {Huihsin Tseng and Pichuan Chang and Galen Andrew and Daniel Jurafsky and Christopher Manning},
  year = 2005,
  title = {A Conditional Random Field Word Segmenter},
  booktitle = {SIGHAN Workshop on Chinese Language Processing},
}

@Book{lunde98cjkv,
  author = {Ken Lunde},
  title = {{CJKV} Information Processing},
  year = 1998,
  publisher = {O'Reilly},
}

% Stemming.
@Article{porter80stripping,
  author = {Martin F. Porter},
  title = {An algorithm for suffix stripping},
  year = 1980,
  journal = {Program},
  volume = 14,
  number = 3,
  pages = {130--137},
}

% Journal name restored to its full form, "Mechanical Translation and
% Computational Linguistics".
@Article{lovins68stemming,
  author = {Julie Beth Lovins},
  title = {Development of a stemming algorithm},
  year = 1968,
  journal = {Mechanical Translation and Computational Linguistics},
  volume = 11,
  number = 1,
  pages = {22--31},
}

@Article{paice90stemmer,
  author = {Paice, Chris D.},
  title = {Another Stemmer},
  journal = {SIGIR Forum},
  volume = 24,
  number = 3,
  pages = {56--61},
  year = 1990,
}

@Article{hull96stemming,
  author = {David Hull},
  title = {Stemming algorithms -- {A} case study for detailed evaluation},
  journal = {JASIS},
  volume = 47,
  number = 1,
  pages = {70--84},
  year = 1996,
}

@Book{salton89automatic,
  author = {Gerard Salton},
  title = {Automatic Text Processing: {T}he Transformation, Analysis, and Retrieval of Information by Computer},
  publisher = {Addison Wesley},
  address = {Reading, MA},
  year = {1989},
}

@Article{harman91suffixing,
  author = {Donna Harman},
  year = 1991,
  title = {How effective is suffixing?},
  journal = {JASIS},
  volume = 42,
  pages = {7--15},
}

@PhDThesis{krovetz95disambiguation,
  author = {Bob Krovetz},
  year = 1995, 
title = {Word sense disambiguation for large text databases},
  school = {University of Massachusetts Amherst},
}

% Evaluation: test collections and assessor agreement.
@InProceedings{cleverdon91cranfield,
  author = {Cyril W. Cleverdon},
  title = {The significance of the {C}ranfield tests on index languages},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {3--12},
  year = 1991,
}

@InProceedings{allan05hard,
  author = {James Allan},
  title = {{HARD} Track Overview in {TREC} 2005: {H}igh Accuracy Retrieval from Documents},
  year = 2005,
  booktitle = {Proc. TREC},
}

@Article{carletta96kappa,
  author = {Jean Carletta},
  year = 1996,
  title = {Assessing Agreement on Classification Tasks: {T}he Kappa Statistic},
  journal = {Computational Linguistics},
  volume = {22},
  pages = {249--254},
}

@Book{krippendorff03content,
  author = {Klaus Krippendorff},
  title = {Content Analysis: {A}n Introduction to its Methodology},
  year = 2003,
  publisher = {Sage},
}

@Article{lombard02content,
  author = {Matthew Lombard and Cheryl C. Bracken and Jennifer Snyder-Duch},
  year = 2002,
  title = {Content analysis in mass communication: {A}ssessment and reporting of intercoder reliability},
  journal = {Human Communication Research},
  volume = 28,
  pages = {587--604},
}

@InProceedings{hersh94ohsumed,
  author = {William Hersh and Chris Buckley and T. J. Leone and David Hickam},
  title = {{OHSUMED}: {A}n interactive retrieval evaluation and new large test collection for research},
  booktitle = {Proc. SIGIR},
  year = {1994},
  isbn = {0-387-19889-X},
  pages = {192--201},
  location = {Dublin, Ireland},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@InProceedings{carbonell98mmr,
  author = {Jaime Carbonell and Jade Goldstein},
  title = {The use of {MMR}, diversity-based reranking for reordering documents and producing summaries},
  booktitle = {Proc. 
SIGIR},
  year = {1998},
  isbn = {1-58113-015-5},
  pages = {335--336},
  location = {Melbourne, Australia},
  doi = {doi.acm.org/10.1145/290941.291025},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@article{kekalainen05relevance,
  author = {Jaana Kek{\"a}l{\"a}inen},
  title = {Binary and graded relevance in {IR} evaluations -- {C}omparison of the effects on ranking of {IR} systems},
  journal = {IP\&M},
  volume = 41,
  year = 2005,
  pages = {1019--1033},
}

% JASIS volume 47 appeared in 1996 (compare hull96stemming). The citation key
% keeps its original "98" so that existing \cite commands still resolve.
@article{harter98relevance,
  author = {Stephen P. Harter},
  title = {Variations in relevance assessments and the measurement of retrieval effectiveness},
  journal = {JASIS},
  volume = {47},
  year = 1996,
  pages = {37--49},
}

@inproceedings{barzilay97chains,
  author = {Regina Barzilay and Michael Elhadad},
  title = {Using Lexical Chains for Text Summarization},
  booktitle = {Workshop on Intelligent Scalable Text Summarization},
  year = 1997,
  pages = {10--17},
}

@inproceedings{jing00reduction,
  author = {Hongyan Jing},
  title = {Sentence reduction for automatic text summarization},
  booktitle = {Proc. Conference on Applied Natural Language Processing},
  year = 2000,
  pages = {310--315},
}

@misc{fallows04internet,
  author = {Deborah Fallows},
  year = {2004},
  title = {The Internet and Daily Life},
  note = {Pew/Internet and {A}merican Life Project},
  url = {www.pewinternet.org/pdfs/PIP\_Internet\_and\_Daily\_Life.pdf},
}

@inproceedings{newsam01image,
  author = {Shawn Newsam and Sitaram Bhagavathy and B. S. Manjunath},
  title = {Category-based image retrieval},
  booktitle = {Proc. IEEE International Conference on Image Processing, Special Session on Multimedia Indexing, Browsing and Retrieval},
  pages = {596--599},
  year = 2001,
}
% volume = {3},

@inproceedings{salton91panel,
  author = {Gerard Salton},
  title = {The {S}mart Project in Automatic Document Retrieval},
  booktitle = {Proc.
SIGIR},
  publisher = {ACM Press},
  year = 1991,
  pages = {356--358},
}

@inproceedings{harman92revisited,
  author = {Donna Harman},
  title = {Relevance feedback revisited},
  year = 1992,
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {1--10},
}

@incollection{ide71,
  author = {E. Ide},
  title = {New experiments in relevance feedback},
  editor = {Gerard Salton},
  year = 1971,
  pages = {337--354},
  crossref = {salton71smart},
}

@article{ruthven03relevance,
  author = {Ruthven, Ian and Lalmas, Mounia},
  year = 2003,
  title = {A survey on the use of relevance feedback for information access systems},
  journal = {Knowledge Engineering Review},
  volume = 18,
  number = 1,
}

@inproceedings{buckley94relevance,
  author = {Chris Buckley and Gerard Salton and James Allan},
  year = 1994,
  title = {The effect of adding relevance information in a relevance feedback environment},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {292--300},
}

% Third author's surname is spelled Ozmutlu.
@article{spink00use,
  author = {Amanda Spink and Bernard J. Jansen and H. Cenk Ozmutlu},
  title = {Use of query reformulation and relevance feedback by {E}xcite users},
  journal = {Internet Research: {E}lectronic Networking Applications and Policy},
  volume = 10,
  year = 2000,
  number = 4,
  pages = {317--328},
  url = {ist.psu.edu/faculty\_pages/jjansen/academic/pubs/internetresearch2000.pdf},
}

@inproceedings{xu96query,
  author = {Jinxi Xu and W. Bruce Croft},
  year = {1996},
  title = {Query Expansion Using Local and Global Document Analysis},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {4--11},
}

@inproceedings{joachims05clickthrough,
  author = {Thorsten Joachims and Laura Granka and Bing Pan and Helene Hembrooke and Geri Gay},
  title = {Accurately Interpreting Clickthrough Data as Implicit Feedback},
  booktitle = {Proc.
SIGIR},
  publisher = {ACM Press},
  year = 2005,
  pages = {154--161},
}

@inproceedings{joachims02clickthrough,
  author = {Thorsten Joachims},
  title = {Optimizing Search Engines Using Clickthrough Data},
  booktitle = {Proc. KDD},
  year = 2002,
  pages = {133--142},
}

@inproceedings{greiff98eda,
  author = {Warren R. Greiff},
  year = 1998,
  title = {A theory of term weighting based on exploratory data analysis},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {11--19},
}

@inproceedings{friedman96tan,
  author = {Friedman, Nir and Moises Goldszmidt},
  year = 1996,
  title = {Building Classifiers using {B}ayesian Networks},
  booktitle = {Proc. National Conference on Artificial Intelligence},
  pages = {1277--1284},
}

@book{ripley96,
  author = {B. D. Ripley},
  title = {Pattern Recognition and Neural Networks},
  publisher = {Cambridge University Press},
  address = {Cambridge},
  year = {1996},
  annote = {Great statistical foundations of classification book!},
}

@article{fuhr92probabilistic,
  author = {Norbert Fuhr},
  title = {Probabilistic Models in Information Retrieval},
  journal = {Computer Journal},
  volume = {35},
  number = {3},
  pages = {243--255},
  year = {1992},
}

@article{crestani98probabilistic,
  author = {Fabio Crestani and Mounia Lalmas and Cornelis J. Van Rijsbergen and Iain Campbell},
  title = {Is this document relevant?\ \ldots{} probably: {A} survey of probabilistic models in information retrieval},
  journal = {ACM Computing Surveys},
  volume = {30},
  number = {4},
  year = {1998},
  issn = {0360-0300},
  pages = {528--552},
  doi = {doi.acm.org/10.1145/299917.299920},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

% Two-part article: the pages field lists both page ranges.
@article{sparckjones00probabilistic,
  author = {Sp{\"a}rck Jones, Karen and S. Walker and Stephen E. Robertson},
  title = {A probabilistic model of information retrieval: Development and comparative experiments},
  journal = {IP\&M},
  volume = 36,
  number = 6,
  pages = {779--808, 809--840},
  year = 2000,
}

@book{jensen01bayesian,
  author = {Finn V. Jensen and Finn B.
Jensen},
  title = {Bayesian Networks and Decision Graphs},
  year = 2001,
  publisher = {Springer},
  address = {Berlin},
}

@book{grinstead97probability,
  author = {Grinstead, Charles M. and J. Laurie Snell},
  year = 1997,
  edition = {2nd},
  title = {Introduction to Probability},
  publisher = {American Mathematical Society},
  address = {Providence, RI},
  url = {www.dartmouth.edu/{\urltilde}chance/teaching\_aids/books\_articles/probability\_book/amsbook.mac.pdf},
}

@book{jurafsky00slp,
  author = {Dan Jurafsky and James H. Martin},
  title = {Speech and Language Processing: {A}n Introduction to Natural Language Processing, Computational Linguistics and Speech Recognition},
  publisher = {Prentice Hall},
  address = {Englewood Cliffs, NJ},
  year = 2000,
}

% Second edition of jurafsky00slp (same authors, title and publisher).
@book{jurafsky08slp,
  author = {Dan Jurafsky and James H. Martin},
  title = {Speech and Language Processing: {A}n Introduction to Natural Language Processing, Computational Linguistics and Speech Recognition},
  edition = {2nd},
  publisher = {Prentice Hall},
  address = {Englewood Cliffs, NJ},
  year = {2008},
}

@inproceedings{ponte98lm,
  author = {Jay M. Ponte and W. Bruce Croft},
  title = {A language modeling approach to information retrieval},
  year = 1998,
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {275--281},
}

@inproceedings{miller99hmm,
  author = {David R. H. Miller and Tim Leek and Richard M. Schwartz},
  title = {A Hidden {Markov} Model Information Retrieval System},
  year = 1999,
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {214--221},
}

@inproceedings{berger99ir,
  author = {Adam Berger and John Lafferty},
  title = {Information retrieval as statistical translation},
  year = 1999,
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {222--229},
}

@inproceedings{hiemstra98linguistically,
  author = {Djoerd Hiemstra},
  title = {A linguistically motivated probabilistic model of information retrieval},
  booktitle = {Proc.
ECDL},
  series = {LNCS},
  volume = {1513},
  pages = {569--584},
  year = {1998},
}

@article{hiemstra00probabilistic,
  author = {Djoerd Hiemstra},
  title = {A probabilistic justification for using tf.idf term weighting in information retrieval},
  journal = {International Journal on Digital Libraries},
  volume = 3,
  number = 2,
  publisher = {Springer},
  pages = {131--139},
  year = 2000,
  issn = {1432-5012},
}

@inproceedings{lafferty01risk,
  author = {John Lafferty and Chengxiang Zhai},
  year = 2001,
  title = {Document language models, query models, and risk minimization for information retrieval},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {111--119},
}

@inproceedings{zhai01smoothing,
  author = {Chengxiang Zhai and John Lafferty},
  title = {A study of smoothing methods for language models applied to ad hoc information retrieval},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = 2001,
  pages = {334--342},
}

% The title is fully brace-protected, so styles will not change its casing.
@unpublished{sparckjones04rational,
  author = {Sp{\"a}rck Jones, Karen},
  title = {{Language modelling's generative model: {I}s it rational?}},
  year = 2004,
  note = {MS, Computer Laboratory, University of Cambridge},
  url = {www.cl.cam.ac.uk/{\urltilde}ksj21/langmodnote4.pdf},
}

@inproceedings{zhai01feedback,
  author = {Chengxiang Zhai and John Lafferty},
  title = {Model-based feedback in the language modeling approach to information retrieval},
  booktitle = {Proc. CIKM},
  publisher = {ACM Press},
  year = 2001,
}

% Edited volume: carries an editor field and no author.
@book{croft03lm,
  editor = {Croft, W. Bruce and John Lafferty},
  title = {Language Modeling for Information Retrieval},
  year = 2003,
  address = {New York},
  publisher = {Springer},
}

@inproceedings{caruana06empirical,
  author = {Rich Caruana and Alexandru Niculescu-Mizil},
  title = {An Empirical Comparison of Supervised Learning Algorithms},
  year = 2006,
  booktitle = {Proc.
ICML},
}

@inproceedings{dumais98inductive,
  author = {Susan Dumais and John Platt and David Heckerman and Mehran Sahami},
  title = {Inductive learning algorithms and representations for text categorization},
  booktitle = {Proc. CIKM},
  year = {1998},
  isbn = {1-58113-061-9},
  pages = {148--155},
  location = {Bethesda, Maryland, United States},
  doi = {doi.acm.org/10.1145/288627.288651},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@article{hand06classifier,
  author = {David J. Hand},
  title = {Classifier Technology and the Illusion of Progress},
  year = 2006,
  journal = {Statistical Science},
  volume = 21,
  pages = {1--14},
}

@book{shawe-taylor04kernel,
  author = {John Shawe-Taylor and Nello Cristianini},
  title = {Kernel Methods for Pattern Analysis},
  year = 2004,
  publisher = {Cambridge University Press},
}

@book{cristianini00svm,
  author = {Nello Cristianini and John Shawe-Taylor},
  year = 2000,
  title = {Introduction to Support Vector Machines and Other Kernel-based Learning Methods},
  publisher = {Cambridge University Press},
}

@book{schoelkopf01kernels,
  author = {Bernhard Sch{\"o}lkopf and Alexander J. Smola},
  title = {Learning with Kernels: {S}upport Vector Machines, Regularization, Optimization, and Beyond},
  year = 2001,
  publisher = {MIT Press},
}

@article{burges98svm,
  author = {Burges, Christopher J. C.},
  title = {A Tutorial on Support Vector Machines for Pattern Recognition},
  journal = {Data Mining and Knowledge Discovery},
  volume = 2,
  number = 2,
  pages = {121--167},
  year = 1998,
}

@article{chen05nusvm,
  author = {Pai-Hsuen Chen and Chih-Jen Lin and Bernhard Sch{\"o}lkopf},
  year = 2005,
  title = {A tutorial on $\nu$-Support Vector Machines},
  journal = {Applied Stochastic Models in Business and Industry},
  volume = 21,
  pages = {111--136},
}

@book{vapnik98statistical,
  author = {Vladimir N.
Vapnik},
  title = {Statistical Learning Theory},
  publisher = {Wiley-Interscience},
  year = 1998,
}

@article{lodhi02text,
  author = {Huma Lodhi and Craig Saunders and John Shawe-Taylor and Nello Cristianini and Chris Watkins},
  year = 2002,
  title = {Text Classification using String Kernels},
  journal = {JMLR},
  volume = 2,
  pages = {419--444},
}

@inproceedings{gaertner02kernels,
  author = {Thomas Gaertner and John W. Lloyd and Peter A. Flach},
  year = 2002,
  title = {Kernels for structured data},
  booktitle = {Proc. International Conference on Inductive Logic Programming},
  pages = {66--83},
}

@inproceedings{joachims98text,
  author = {Thorsten Joachims},
  title = {Text categorization with support vector machines: {L}earning with many relevant features},
  booktitle = {Proc. ECML},
  publisher = {Springer},
  address = {Heidelberg},
  pages = {137--142},
  year = {1998},
}
% Fields left disabled for joachims98text. Each sits on its own line so that
% parsers treating a percent sign as comment-to-end-of-line do not also
% swallow the entry that follows.
% series = {Lecture Notes in Artificial Intelligence},
% number = "1398",
% editor = "Claire N{\'e}dellec and C{\'e}line Rouveirol",

@article{359041,
  author = {James L. Peterson},
  title = {Computer programs for detecting and correcting spelling errors},
  publisher = {ACM Press},
  journal = {CACM},
  volume = {23},
  number = {12},
  year = {1980},
  issn = {0001-0782},
  pages = {676--687},
  doi = {doi.acm.org/10.1145/359038.359041},
  address = {New York, NY},
}

@article{363994,
  author = {Fred J.
Damerau},
  title = {A technique for computer detection and correction of spelling errors},
  publisher = {ACM Press},
  journal = {CACM},
  volume = {7},
  number = {3},
  year = {1964},
  issn = {0001-0782},
  pages = {171--176},
  doi = {doi.acm.org/10.1145/363958.363994},
  address = {New York, NY},
}

@article{146380,
  author = {Karen Kukich},
  title = {Techniques for automatically correcting words in text},
  publisher = {ACM Press},
  journal = {ACM Computing Surveys},
  volume = {24},
  number = {4},
  year = {1992},
  issn = {0360-0300},
  pages = {377--439},
  doi = {doi.acm.org/10.1145/146370.146380},
  address = {New York, NY},
}

% Double issue: the number range uses "--" like the other ranges in this file.
@article{permuterm,
  author = {Eugene Garfield},
  title = {The Permuterm Subject Index: {A}n Autobiographic Review},
  journal = {JASIS},
  year = {1976},
  pages = {288--291},
  volume = {27},
  number = {5--6},
}

@article{zobel95finding,
  author = {Justin Zobel and Philip Dart},
  title = {Finding Approximate Matches in Large Lexicons},
  journal = {Software Practice and Experience},
  year = {1995},
  volume = {25},
  number = {3},
  pages = {331--345},
  url = {citeseer.ifi.unizh.ch/zobel95finding.html},
}

@article{editdistance,
  author = {Vladimir I. Levenshtein},
  title = {Binary codes capable of correcting spurious insertions and deletions of ones},
  journal = {Problems of Information Transmission},
  volume = {1},
  pages = {8--17},
  year = {1965},
}

@article{liu05svms,
  author = {Tie-Yan Liu and Yiming Yang and Hao Wan and Hua-Jun Zeng and Zheng Chen and Wei-Ying Ma},
  title = {Support Vector Machines Classification with Very Large Scale Taxonomy},
  journal = {ACM SIGKDD Explorations},
  volume = 7,
  number = 1,
  pages = {36--43},
  year = 2005,
}

@article{weigend99hierarchy,
  author = {Andreas S. Weigend and Erik D. Wiener and Jan O.
Pedersen},
  title = {Exploiting Hierarchy in Text Categorization},
  year = 1999,
  journal = {IR},
  volume = 1,
  number = 3,
  pages = {193--216},
}

@inproceedings{koller97hierarchy,
  author = {Koller, Daphne and Sahami, Mehran},
  year = 1997,
  title = {Hierarchically Classifying Documents Using Very Few Words},
  booktitle = {Proc. ICML},
  pages = {170--178},
}

@inproceedings{dumais00hierarchical,
  author = {Susan T. Dumais and Hao Chen},
  title = {Hierarchical classification of {W}eb content},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {256--263},
  year = {2000},
}

@article{pugh90skip,
  author = {William Pugh},
  title = {Skip lists: {A} probabilistic alternative to balanced trees},
  journal = {CACM},
  volume = 33,
  number = 6,
  pages = {668--676},
  year = 1990,
}

@inproceedings{papineni01why,
  author = {Kishore Papineni},
  title = {Why Inverse Document Frequency?},
  booktitle = {Proc. North American Chapter of the Association for Computational Linguistics},
  year = 2001,
  pages = {1--8},
}

% Technical-report version; salton88term below is the journal article.
@techreport{buckleysalton_termweighting,
  author = {Gerard Salton and Chris Buckley},
  title = {Term Weighting Approaches in Automatic Text Retrieval},
  year = {1987},
  institution = {Cornell University},
  address = {Ithaca, NY, USA},
}

@article{salton88term,
  author = {Gerard Salton and Christopher Buckley},
  title = {Term-Weighting Approaches in Automatic Text Retrieval},
  year = {1988},
  journal = {IP\&M},
  volume = {24},
  number = {5},
  pages = {513--523},
}

@article{luhn57,
  author = {Hans Peter Luhn},
  title = {A statistical approach to mechanized encoding and searching of literary information},
  journal = {IBM Journal of Research and Development},
  volume = {1},
  number = {4},
  pages = {309--317},
  year = {1957},
}

@article{luhn58,
  author = {Hans Peter Luhn},
  title = {The Automatic Creation of Literature Abstracts},
  journal = {IBM Journal of Research and Development},
  volume = {2},
  number = {2},
  pages = {159--165, 317},
  year = {1958},
}

@inproceedings{singhal96pivoted,
  author = "Amit Singhal
and Chris Buckley and Mandar Mitra",
  title = {Pivoted Document Length Normalization},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {21--29},
  year = {1996},
  url = {citeseer.ist.psu.edu/singhal96pivoted.html},
}

@inproceedings{toutanova02pronunciation,
  author = {Kristina Toutanova and Robert C. Moore},
  title = {Pronunciation Modeling for Improved Spelling Correction},
  booktitle = {Proc. ACL},
  year = 2002,
  pages = {144--151},
}

@inproceedings{kernighan90spelling,
  author = {Mark D. Kernighan and Kenneth W. Church and William A. Gale},
  title = {A spelling correction program based on a noisy channel model},
  year = 1990,
  booktitle = {Proc. ACL},
  pages = {205--210},
}
% volume = 2,

@inproceedings{brill00improved,
  author = {Eric Brill and Robert C. Moore},
  year = 2000,
  title = {An improved error model for noisy channel spelling correction},
  booktitle = {Proc. ACL},
  pages = {286--293},
}

@techreport{viewingterm,
  author = {Ruihua Song and Ji-Rong Wen and Wei-Ying Ma},
  title = {Viewing Term Proximity from a Different Perspective},
  institution = {Microsoft Research},
  year = 2005,
  number = {MSR-TR-2005-69},
}

% Initials are separated by a space so BibTeX parses them as two name tokens.
@article{onetothree,
  author = {Charles L. A. Clarke and Gordon V. Cormack and Elizabeth A. Tudhope},
  title = {Relevance ranking for one to three term queries},
  journal = {IP\&M},
  volume = {36},
  year = {2000},
  pages = {291--311},
}

% month uses the predefined macro so each style can format it itself.
@article{gao05chinese,
  title = {{C}hinese Word Segmentation and Named Entity Recognition: A Pragmatic Approach},
  author = {Jianfeng Gao and Mu Li and Chang-Ning Huang and Andi Wu},
  journal = {Computational Linguistics},
  month = dec,
  year = 2005,
  volume = 31,
  number = 4,
  pages = {531--574},
}

@inproceedings{cavnar94ngram,
  author = {William B. Cavnar and John M. Trenkle},
  title = {N-Gram-Based Text Categorization},
  booktitle = {Proc.
SDAIR},
  pages = {161--175},
  year = 1994,
}

% NOTE(review): standard styles ignore howpublished on a techreport, so the
% package location below is probably never printed -- confirm against the
% style in use before moving it to note.
@techreport{dunning94identification,
  author = {Ted Dunning},
  title = {Statistical Identification of Language},
  institution = {Computing Research Laboratory, New Mexico State University},
  number = {94-273},
  year = 1994,
  howpublished = {Inside the package \url{ftp://crl.nmsu.edu/pub/misc/lingdet\_suite.tar.gz}},
}

@book{konheim81cryptography,
  author = {Alan G. Konheim},
  year = 1981,
  title = {Cryptography: {A} Primer},
  publisher = {John Wiley \& Sons},
}

% Year brought in line with the "88" in the citation key (it read 1998).
@inproceedings{beesley88identifier,
  author = {Kenneth R. Beesley},
  year = 1988,
  title = {Language Identifier: {A} Computer Program for Automatic Natural-Language Identification of On-Line Text},
  booktitle = {Languages at Crossroads: {P}roc. Annual Conference of the American Translators Association},
  pages = {47--54},
}

@inproceedings{hughes06identification,
  author = {Hughes, Baden and Baldwin, Timothy and Bird, Steven and Nicholson, Jeremy and MacKinlay, Andrew},
  year = 2006,
  title = {Reconsidering Language Identification for Written Language Resources},
  booktitle = {Proc. International Conference on Language Resources and Evaluation},
  pages = {485--488},
}

@inproceedings{mckeown02news,
  author = {Kathleen R. McKeown and Regina Barzilay and David Evans and Vasileios Hatzivassiloglou and Judith L. Klavans and Ani Nenkova and Carl Sable and Barry Schiffman and Sergey Sigelman},
  title = {Tracking and Summarizing News on a Daily Basis with {C}olumbia's {N}ewsblaster},
  booktitle = {Proc. Human Language Technology Conference},
  year = 2002,
}

@inproceedings{chen00multilingual,
  author = {Hsin-Hsi Chen and Chuan-Jie Lin},
  year = 2000,
  title = {A Multilingual News Summarizer},
  booktitle = {Proc. COLING},
  pages = {159--165},
}

@book{sproat92morphology,
  author = {Sproat, Richard William},
  title = {Morphology and computation},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  year = {1992},
}

@book{beesley03finite,
  title = {Finite State Morphology},
  author = {Kenneth R.
Beesley and Lauri Karttunen},
  publisher = {CSLI Publications},
  address = {Stanford, CA},
  year = 2003,
}

@article{zobel06inverted,
  author = {Justin Zobel and Alistair Moffat},
  year = {2006},
  title = {Inverted Files for Text Search Engines},
  journal = {ACM Computing Surveys},
  volume = {38},
  number = {2},
}

@techreport{mercator1,
  author = {Marc Najork and Allan Heydon},
  title = {High-Performance Web Crawling},
  institution = {Compaq Systems Research Center},
  year = 2001,
  number = {173},
}

% Book-chapter publication with the same authors and title as mercator1.
@incollection{najorkheydon2002,
  author = {Marc Najork and Allan Heydon},
  title = {High-Performance Web Crawling},
  booktitle = {Handbook of Massive Data Sets},
  publisher = {Kluwer},
  year = 2002,
  editor = {James Abello and Panos Pardalos and Mauricio Resende},
  chapter = 2,
}

@inproceedings{brinpageanatomy,
  author = {Sergey Brin and Lawrence Page},
  year = 1998,
  title = {The anatomy of a large-scale hypertextual Web search engine},
  booktitle = {Proc. WWW},
  pages = {107--117},
}

@inproceedings{chocrawling1998,
  author = {Junghoo Cho and Hector Garcia-Molina and Lawrence Page},
  year = 1998,
  title = {Efficient crawling through {URL} ordering},
  booktitle = {Proc. WWW},
  pages = {161--172},
}

@inproceedings{webbase2000,
  author = {Jun Hirai and Sriram Raghavan and Hector Garcia-Molina and Andreas Paepcke},
  year = 2000,
  title = {{WebBase}: {A} repository of web pages},
  booktitle = {Proc. WWW},
  pages = {277--293},
}

@article{burnercrawling,
  author = {Mike Burner},
  title = {Crawling towards Eternity: {B}uilding an archive of the {W}orld {W}ide {W}eb},
  journal = {Web Techniques Magazine},
  volume = 2,
  number = 5,
  year = 1997,
}

@inproceedings{connserver,
  author = {Krishna Bharat and Andrei Broder and Monika Henzinger and Puneet Kumar and Suresh Venkatasubramanian},
  year = 1998,
  title = {The connectivity server: {F}ast access to linkage information on the Web},
  booktitle = {Proc.
WWW},
  pages = {469--477},
}

@inproceedings{boldivigna1,
  author = {Paolo Boldi and Sebastiano Vigna},
  year = 2004,
  title = {{The WebGraph framework I: {C}ompression techniques}},
  booktitle = {Proc. WWW},
  publisher = {ACM Press},
  pages = {595--601},
}

@article{boldivigna2,
  author = {Paolo Boldi and Sebastiano Vigna},
  title = {Codes for the {World-Wide Web}},
  journal = {Internet Mathematics},
  year = 2004,
  pages = {405--427},
  volume = {2},
  number = {4},
}

@techreport{page98pagerank,
  author = {Lawrence Page and Sergey Brin and Rajeev Motwani and Terry Winograd},
  institution = {Stanford Digital Library Technologies Project},
  title = {The {PageRank} Citation Ranking: {B}ringing Order to the Web},
  year = {1998},
  url = {citeseer.ist.psu.edu/page98pagerank.html},
}

@inproceedings{haveliwala02topicsensitive,
  author = {Taher H. Haveliwala},
  title = {Topic-sensitive {PageRank}},
  booktitle = {Proc. WWW},
  address = {Honolulu, HI},
  month = may,
  year = 2002,
  url = {citeseer.ist.psu.edu/haveliwala02topicsensitive.html},
}

% Presumably the journal follow-up to haveliwala02topicsensitive (same author,
% closely related title) -- verify before merging or de-duplicating.
@article{haveliwala03topicsensitive,
  author = {Taher Haveliwala},
  title = {Topic-sensitive {PageRank}: {A} context-sensitive ranking algorithm for web search},
  journal = {IEEE Transactions on Knowledge and Data Engineering},
  volume = 15,
  number = 4,
  pages = {784--796},
  year = {2003},
  url = {citeseer.ist.psu.edu/article/haveliwala03topicsensitive.html},
}

@article{kleinberg99authoritative,
  author = {Jon M. Kleinberg},
  title = {Authoritative sources in a hyperlinked environment},
  journal = {JACM},
  volume = {46},
  number = {5},
  pages = {604--632},
  year = {1999},
  url = {citeseer.ist.psu.edu/article/kleinberg98authoritative.html},
}

@inproceedings{jehwidom,
  author = {Glen Jeh and Jennifer Widom},
  title = {{Scaling personalized web search}},
  booktitle = {Proc.
WWW},
  publisher = {ACM Press},
  year = {2003},
  pages = {271--279},
  location = {Budapest},
  address = {New York, NY},
}

@inproceedings{chakrabarti98automatic,
  author = {Soumen Chakrabarti and Byron Dom and David Gibson and Jon Kleinberg and Prabhakar Raghavan and Sridhar Rajagopalan},
  title = {Automatic resource list compilation by analyzing hyperlink structure and associated text},
  booktitle = {Proc. WWW},
  year = {1998},
  url = {citeseer.ist.psu.edu/chakrabarti98automatic.html},
}

@inproceedings{bharat98improved,
  author = {Krishna Bharat and Monika R. Henzinger},
  title = {Improved algorithms for topic distillation in a hyperlinked environment},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  address = {Melbourne, AU},
  pages = {104--111},
  year = {1998},
  url = {citeseer.ist.psu.edu/bharat98improved.html},
}

@inproceedings{ng01link,
  author = {Andrew Y. Ng and Alice X. Zheng and Michael I. Jordan},
  title = {Link Analysis, Eigenvectors and Stability},
  booktitle = {Proc. IJCAI},
  pages = {903--910},
  year = {2001},
  url = {citeseer.ist.psu.edu/ng01link.html},
}

@inproceedings{borodintsaparas,
  author = {Allan Borodin and Gareth O. Roberts and Jeffrey S. Rosenthal and Panayiotis Tsaparas},
  title = {Finding authorities and hubs from link structures on the {World Wide Web}},
  booktitle = {Proc. WWW},
  year = {2001},
  pages = {415--429},
}

@article{lempel00stochastic,
  author = {Ronny Lempel and Shlomo Moran},
  title = {The stochastic approach for link-structure analysis {({SALSA})} and the {TKC} effect},
  journal = {Computer Networks},
  volume = {33},
  number = {1--6},
  pages = {387--401},
  year = {2000},
  url = {citeseer.ist.psu.edu/lempel00stochastic.html},
}

@techreport{baeza05choice,
  abstract = {This paper studies a family of link-based algorithms that propagate page importance through links. In these algorithms there is a damping function that decreases with the distance, so a direct link implies more endorsement than a link through a long path.
{PageRank} is the most widely known ranking function of this family. We focus on three damping functions, having linear, exponential, and hyperbolic decay on the lengths of the paths. The exponential decay corresponds to {PageRank}, and the other functions are new. Our analysis includes a comparison among them and experiments for studying their behavior under different parameters.},
  author = {Ricardo Baeza-Yates and Paolo Boldi and Carlos Castillo},
  citeulike-article-id = {322774},
  institution = {Dipartimento di Scienze dell'Informazione, Universit\`{a} degli Studi di Milano},
  keywords = {ranking web-graph},
  month = sep,
  priority = {0},
  title = {The Choice of a Damping Function for Propagating Importance in Link-Based Ranking},
  year = {2005},
}

@inproceedings{boldi05pagerank,
  author = {Paolo Boldi and Massimo Santini and Sebastiano Vigna},
  title = {{PageRank} as a function of the damping factor},
  booktitle = {Proc. WWW},
  year = {2005},
  url = {citeseer.ist.psu.edu/boldi05pagerank.html},
}

@article{berkhinpagerank,
  author = {Pavel Berkhin},
  title = {A survey on pagerank computing},
  journal = {Internet Mathematics},
  volume = {2},
  number = {1},
  pages = {73--120},
  year = {2005},
}

@inproceedings{boldi02ubicrawler,
  author = {Paolo Boldi and Bruno Codenotti and Massimo Santini and Sebastiano Vigna},
  title = {Ubicrawler: {A} scalable fully distributed web crawler},
  booktitle = {Proc. Australian World Wide Web Conference},
  year = {2002},
  url = {citeseer.ist.psu.edu/article/boldi03ubicrawler.html},
}

@inproceedings{shkapenyuk02design,
  author = {Vladislav Shkapenyuk and Torsten Suel},
  title = {Design and Implementation of a High-Performance Distributed Web Crawler},
  booktitle = {Proc. International Conference on Data Engineering},
  year = {2002},
  url = {citeseer.ist.psu.edu/shkapenyuk02design.html},
}

@article{321094,
  author = {Charles P. Bourne and Donald F.
Ford},
  title = {A Study of Methods for Systematically Abbreviating {E}nglish Words and Names},
  publisher = {ACM Press},
  journal = {JACM},
  volume = {8},
  number = {4},
  year = {1961},
  issn = {0004-5411},
  pages = {538--552},
  doi = {doi.acm.org/10.1145/321088.321094},
  address = {New York, NY},
}

@inproceedings{garcia04access,
  title = {Access-ordered indexes},
  booktitle = {Proc. Australasian Conference on Computer Science},
  pages = {7--14},
  year = 2004,
  author = {Steven Garcia and Hugh E. Williams and Adam Cannane},
}

% Book-level fields are inherited through crossref; the disabled copies are
% kept below, one per line.
@incollection{robertson05okapi,
  author = {Stephen Robertson},
  title = {How {O}kapi came to {TREC}},
  crossref = {voorhees05experiment},
  year = 2005,
  pages = {287--299},
}
% editor = {E.M. Voorhees and D.K. Harman},
% booktitle = {{TREC}: {E}xperiments and Evaluation in Information
% Retrieval},
% publisher = {MIT Press},

% The accent is written as a brace group starting with the control sequence,
% which BibTeX treats as a single letter when sorting and building labels.
@article{aizerman64theoretical,
  author = {Mark A. Aizerman and Emmanuel M. Braverman and Lev I. Rozono{\'e}r},
  year = 1964,
  title = {Theoretical foundations of the potential function method in pattern recognition learning},
  journal = {Automation and Remote Control},
  volume = 25,
  pages = {821--837},
}

@inproceedings{radev01interactive,
  author = {Dragomir R. Radev and Sasha Blair-Goldensohn and Zhu Zhang and Revathi Sundara Raghavan},
  year = 2001,
  title = {Interactive, Domain-Independent Identification and Summarization of Topically Related News Articles},
  booktitle = {Proc. European Conference on Research and Advanced Technology for Digital Libraries},
  pages = {225--238},
}

@book{knuthvol3,
  author = {Donald E. Knuth},
  year = {1997},
  title = {The Art of Computer Programming, Volume 3: {S}orting and Searching},
  publisher = {Addison Wesley},
  edition = {3rd},
}

@inproceedings{cohen98learning,
  author = {William W. Cohen and Robert E. Schapire and Yoram Singer},
  title = {Learning to Order Things},
  booktitle = {Proc.
NIPS},
  publisher = {The {MIT} Press},
  year = {1998},
  url = {citeseer.ist.psu.edu/article/cohen98learning.html},
}
% Disabled fields of cohen98learning, one per line so that parsers treating a
% percent sign as comment-to-end-of-line do not also drop the next entry.
% editor = "Michael I. Jordan and Michael J. Kearns and Sara A. Solla",
% volume = "10",

@inproceedings{zaragoza03bayesian,
  author = {Hugo Zaragoza and Djoerd Hiemstra and Michael Tipping and Stephen Robertson},
  title = {Bayesian Extension to the Language Model for Ad Hoc Information Retrieval},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2003},
  pages = {4--9},
}

@incollection{hiemstra05lm,
  author = {Djoerd Hiemstra and Wessel Kraaij},
  title = {A Language-Modeling Approach to {TREC}},
  crossref = {voorhees05experiment},
  year = 2005,
  pages = {373--395},
}

@inproceedings{gao04dependence,
  author = {Jianfeng Gao and Jian-Yun Nie and Guangyuan Wu and Guihong Cao},
  title = {Dependence language model for information retrieval},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {170--177},
  year = 2004,
}

@inproceedings{cao05integrating,
  author = {Guihong Cao and Jian-Yun Nie and Jing Bai},
  title = {Integrating word relationships into language models},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {298--305},
  year = 2005,
}

@inproceedings{santos01distributed,
  author = {Claudine Santos Badue and Ricardo A. Baeza-Yates and Berthier Ribeiro-Neto and Nivio Ziviani},
  title = {Distributed Query Processing Using Partitioned Inverted Files},
  booktitle = {Proc. SPIRE},
  year = 2001,
  pages = {10--20},
}

@inproceedings{ribeiro-neto98query,
  author = {Berthier A. Ribeiro-Neto and Ramurti A. Barbosa},
  title = {Query Performance for Tightly Coupled Distributed Digital Libraries},
  booktitle = {Proc.
ACM Conference on Digital Libraries},
  year = 1998,
  pages = {182--190},
}

@article{tomasic93query,
  author = {Anthony Tomasic and Hector Garcia-Molina},
  title = {Query Processing and Inverted Indices in Shared-Nothing Document Information Retrieval Systems},
  journal = {VLDB Journal},
  volume = {2},
  number = {3},
  year = {1993},
  pages = {243--275},
}

@article{jeong95inverted,
  author = {Byeong-Soo Jeong and Edward Omiecinski},
  title = {Inverted File Partitioning Schemes in Multiple Disk Systems},
  journal = {IEEE Transactions on Parallel and Distributed Systems},
  volume = 6,
  number = 2,
  pages = {142--153},
  year = 1995,
}

% Initials are space-separated so each one is parsed as its own name token.
@inproceedings{macfarlane00parallel,
  author = {A. MacFarlane and J. A. McCann and S. E. Robertson},
  title = {Parallel Search using Partitioned Inverted Files},
  booktitle = {Proc. SPIRE},
  year = {2000},
  pages = {209--220},
}

@incollection{liddy05automatic,
  author = {Elizabeth D. Liddy},
  year = 2005,
  title = {Automatic Document Retrieval},
  booktitle = {Encyclopedia of Language and Linguistics},
  edition = {2nd},
  publisher = {Elsevier},
}

@article{bush45memex,
  author = {Vannevar Bush},
  title = {As We May Think},
  journal = {The Atlantic Monthly},
  year = {1945},
  url = {www.theatlantic.com/doc/194507/bush},
}

@book{taube58information,
  editor = {Mortimer Taube and Harold Wooster},
  year = 1958,
  title = {Information storage and retrieval: {T}heory, systems, and devices},
  address = {New York},
  publisher = {Columbia University Press},
}

@incollection{mooers61mathematical,
  author = {Mooers, Calvin},
  title = {From a point of view of mathematical etc. techniques},
  pages = {xvii--xxiii},
  editor = {Fairthorne, R. A.},
  booktitle = {Towards information retrieval},
  address = {London},
  publisher = {Butterworths},
  year = 1961,
}

@inproceedings{long03optimized,
  author = {Xiaohui Long and Torsten Suel},
  title = {Optimized Query Execution in Large Search Engines with Global Page Ordering},
  booktitle = {Proc.
VLDB},
  year = {2003},
  url = {citeseer.ist.psu.edu/long03optimized.html},
}

@book{spink05cognitive,
  editor = {Amanda Spink and Charles Cole},
  title = {New Directions in Cognitive Information Retrieval},
  year = {2005},
  publisher = {Springer},
}

% Spelling correction and approximate matching.
@inproceedings{zobel96phonetic,
  author = {Justin Zobel and Philip Dart},
  title = {Phonetic String Matching: {L}essons from Information Retrieval},
  year = {1996},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {166--173},
}

@inproceedings{cucerzan04spelling,
  author = {Silviu Cucerzan and Eric Brill},
  title = {Spelling Correction as an Iterative Process that Exploits the Collective Knowledge of Web Users},
  booktitle = {Proc. Empirical Methods in Natural Language Processing},
  year = {2004},
}

@inproceedings{chierichetti2007,
  author = {Flavio Chierichetti and Alessandro Panconesi and Prabhakar Raghavan and Mauro Sozio and Alessandro Tiberi and Eli Upfal},
  title = {Finding Near Neighbors Through Cluster Pruning},
  booktitle = {Proc. PODS},
  year = {2007},
}

% Page range uses an en-dash (double hyphen).
@article{eckartyoung,
  author = {Carl Eckart and Gale Young},
  title = {The approximation of a matrix by another of lower rank},
  journal = {Psychometrika},
  volume = {1},
  pages = {211--218},
  year = {1936},
}

% SIGIR 1999 paper (latent semantic indexing). Month given as the standard
% three-letter macro so the style controls its rendering.
@inproceedings{th:plsi,
  author = {Thomas Hofmann},
  title = {{P}robabilistic {L}atent {S}emantic {I}ndexing},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  address = {Berkeley, California},
  pages = {50--57},
  month = aug,
  year = {1999},
  url = {citeseer.ist.psu.edu/article/hofmann99probabilistic.html},
}

% The UAI 1999 paper is titled "... Latent Semantic Analysis"; the
% "Indexing" title belongs to the separate SIGIR paper (th:plsi).
@inproceedings{hofmann99probabilistic,
  author = {Thomas Hofmann},
  title = {{P}robabilistic {L}atent {S}emantic {A}nalysis},
  booktitle = {Proc.
UAI},
  address = {Stockholm},
  year = {1999},
  url = {citeseer.ist.psu.edu/hofmann99probabilistic.html},
}

% Strang wrote this textbook, so he is recorded as author rather than editor
% (an editor field makes styles print "editor" after the name).
@book{strang,
  author = {Gilbert Strang},
  title = {Introduction to Applied Mathematics},
  year = {1986},
  publisher = {Wellesley-Cambridge Press},
}

% The Web: structure and early history.
@article{bernerslee92worldwide,
  author = {Tim Berners-Lee and Robert Cailliau and Jean-Fran{\c{c}}ois Groff and Bernd Pollermann},
  title = {{World-Wide Web}: {T}he Information Universe},
  journal = {Electronic Networking: {R}esearch, Applications and Policy},
  volume = {1},
  number = {2},
  pages = {74--82},
  year = {1992},
  url = {citeseer.ist.psu.edu/article/berners-lee92worldwide.html},
}

@inproceedings{kumar00the,
  author = {S. Ravi Kumar and Prabhakar Raghavan and Sridhar Rajagopalan and Dandapani Sivakumar and Andrew Tomkins and Eli Upfal},
  title = {{T}he {W}eb as a Graph},
  booktitle = {Proc. PODS},
  publisher = {ACM Press},
  pages = {1--10},
  year = {2000},
  url = {citeseer.ist.psu.edu/article/kumar00web.html},
}

% Only the acronyms and the word after the colon are brace-protected; the
% previous value braced the whole title, which disabled style casing.
@inproceedings{mcbryan94genvl,
  author = {Oliver A. McBryan},
  title = {{GENVL} and {WWWW}: {T}ools for Taming the {Web}},
  booktitle = {Proc. WWW},
  address = {Geneva},
  year = {1994},
  url = {citeseer.ist.psu.edu/mcbryan94genvl.html},
}
% editor = "O. Nierstarsz",

@inproceedings{bgmz97shingling,
  author = {Andrei Z. Broder and Steven C. Glassman and Mark S. Manasse and Geoffrey Zweig},
  title = {Syntactic clustering of the web},
  booktitle = {Proc. WWW},
  pages = {391--404},
  year = {1997},
}

@article{792552,
  author = {Andrei Broder},
  title = {A taxonomy of web search},
  publisher = {ACM Press},
  journal = {SIGIR Forum},
  volume = {36},
  number = {2},
  year = {2002},
  issn = {0163-5840},
  pages = {3--10},
  doi = {doi.acm.org/10.1145/792550.792552},
  address = {New York, NY},
}

@article{440656,
  author = {Andrei Broder and S.
Ravi Kumar and Farzin Maghoul and Prabhakar Raghavan and Sridhar Rajagopalan and Raymie Stata and Andrew Tomkins and Janet Wiener},
  journal = {Computer Networks},
  month = jun,
  number = {1--6},
  pages = {309--320},
  title = {Graph structure in the Web},
  volume = {33},
  year = {2000},
}
% In the entry that closes just above: the paper appeared in the combined
% issue 1--6 of volume 33, and the month uses the standard macro.

% Issue range written with an en-dash.
@article{297863,
  author = {Krishna Bharat and Andrei Broder},
  title = {A technique for measuring the relative size and overlap of public Web search engines},
  journal = {Computer Networks and ISDN Systems},
  volume = {30},
  number = {1--7},
  year = {1998},
  issn = {0169-7552},
  pages = {379--388},
  doi = {dx.doi.org/10.1016/S0169-7552(98)00127-5},
  publisher = {Elsevier},
  address = {Amsterdam},
}

@article{lawrence98searching,
  author = {Steve Lawrence and C. Lee Giles},
  title = {Searching the {World Wide Web}},
  journal = {Science},
  volume = {280},
  number = {5360},
  pages = {98--100},
  year = {1998},
  url = {citeseer.ist.psu.edu/lawrence98searching.html},
}

@inproceedings{rusmevichientong01methods,
  author = {Paat Rusmevichientong and David M. Pennock and Steve Lawrence and C. Lee Giles},
  title = {Methods for Sampling Pages Uniformly from the World Wide Web},
  booktitle = {Proc. {AAAI} Fall Symposium on Using Uncertainty Within Computation},
  pages = {121--128},
  year = {2001},
  url = {citeseer.ist.psu.edu/rusmevichientong01methods.html},
}

% Nature volume for July 1999 is 400 (issue 6740), not 500.
@article{lawrence99giles,
  author = {Steve Lawrence and C. Lee Giles},
  title = {Accessibility of information on the Web},
  journal = {Nature},
  volume = {400},
  number = {6740},
  pages = {107--109},
  year = {1999},
}

@inproceedings{346289,
  author = {Monika R. Henzinger and Allan Heydon and Michael Mitzenmacher and Marc Najork},
  title = {On near-uniform {URL} sampling},
  booktitle = {Proc.
WWW},
  year = {2000},
  pages = {295--308},
  location = {Amsterdam},
  doi = {dx.doi.org/10.1016/S1389-1286(00)00055-4},
  publisher = {North-Holland},
  address = {Amsterdam, The Netherlands},
}
% The address of the entry that closes just above had the country repeated.

% Sampling from, and crawling of, the Web.
@inproceedings{1135833,
  author = {Ziv Bar-Yossef and Maxim Gurevich},
  title = {Random sampling from a search engine's index},
  booktitle = {Proc. WWW},
  publisher = {ACM Press},
  year = {2006},
  isbn = {1-59593-323-9},
  pages = {367--376},
  location = {Edinburgh},
  doi = {doi.acm.org/10.1145/1135777.1135833},
  address = {New York, NY},
}

% Page range uses an en-dash (double hyphen).
@article{bharat00comparison,
  author = {Krishna Bharat and Andrei Z. Broder and Jeffrey Dean and Monika Rauch Henzinger},
  title = {A comparison of techniques to find mirrored hosts on the {WWW}},
  journal = {JASIS},
  volume = {51},
  number = {12},
  pages = {1114--1122},
  year = {2000},
  url = {citeseer.ist.psu.edu/bharat99comparison.html},
}

@inproceedings{511464,
  author = {Junghoo Cho and Hector Garcia-Molina},
  title = {Parallel crawlers},
  booktitle = {Proc. WWW},
  publisher = {ACM Press},
  year = {2002},
  isbn = {1-58113-449-5},
  pages = {124--135},
  location = {Honolulu, HI},
  doi = {doi.acm.org/10.1145/511446.511464},
  address = {New York, NY},
}

@inproceedings{carmel01static,
  author = {David Carmel and Doron Cohen and Ronald Fagin and Eitan Farchi and Michael Herscovici and Yoelle S. Maarek and Aya Soffer},
  title = {Static index pruning for information retrieval systems},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2001},
  isbn = {1-58113-331-6},
  pages = {43--50},
  location = {New Orleans, LA},
  doi = {doi.acm.org/10.1145/383952.383958},
  address = {New York, NY},
}

@book{friedl06regular,
  author = {Jeffrey E. F. Friedl},
  year = {2006},
  edition = {3rd},
  title = {Mastering Regular Expressions},
  publisher = {O'Reilly},
  address = {Sebastopol, CA},
}

@inproceedings{comperm,
  author = {Paolo Ferragina and Rossano Venturini},
  title = {Compressed permuterm indexes},
  booktitle = {Proc.
SIGIR},
  publisher = {ACM Press},
  year = {2007},
  address = {New York, NY},
}

% Relevance and evaluation of retrieval effectiveness.
@article{maron60relevance,
  author = {Maron, M. E. and Kuhns, J. L.},
  title = {On relevance, probabilistic indexing, and information retrieval},
  journal = {JACM},
  volume = {7},
  number = {3},
  pages = {216--244},
  year = {1960},
}

@article{blair85evaluation,
  author = {David C. Blair and M. E. Maron},
  year = {1985},
  title = {An Evaluation of Retrieval Effectiveness for a Full-Text Document-Retrieval System},
  journal = {CACM},
  volume = {28},
  number = {3},
  pages = {289--299},
}

% Second author uses the "Last, Jr., First" form so the suffix is parsed as such.
@book{siegel88nonparametric,
  author = {Sidney Siegel and Castellan, Jr., N. John},
  title = {Nonparametric Statistics for the Behavioral Sciences},
  edition = {2nd},
  publisher = {McGraw Hill},
  address = {New York},
  year = {1988},
}

% Leading blank removed from the page range; issue number added.
@article{voorhees00variations,
  author = {Ellen M. Voorhees},
  title = {Variations in Relevance Judgments and the Measurement of Retrieval Effectiveness},
  journal = {IP\&M},
  volume = {36},
  number = {5},
  pages = {697--716},
  year = {2000},
}

@inproceedings{aslam05geometric,
  author = {Javed A. Aslam and Emine Yilmaz},
  year = {2005},
  title = {A geometric interpretation and analysis of {R}-precision},
  booktitle = {Proc. CIKM},
  publisher = {ACM Press},
  pages = {664--671},
}

@inproceedings{strohman07efficient,
  author = {Trevor Strohman and Croft, W. Bruce},
  title = {Efficient Document Retrieval in Main Memory},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  pages = {175--182},
  year = {2007},
}

@book{gusfield97algorithms,
  author = {Dan Gusfield},
  title = {Algorithms on Strings, Trees and Sequences: {C}omputer Science and Computational Biology},
  year = {1997},
  publisher = {Cambridge University Press},
  address = {Cambridge},
}

@techreport{lee88experimental,
  author = {Lee, Whay C.
and Fox, Edward A.},
  year = {1988},
  title = {Experimental Comparison of Schemes for Interpreting {B}oolean Queries},
  number = {TR-88-27},
  institution = {Computer Science, Virginia Polytechnic Institute and State University},
}

@book{hopcroft00automata,
  author = {John E. Hopcroft and Rajeev Motwani and Jeffrey D. Ullman},
  title = {Introduction to Automata Theory, Languages, and Computation},
  publisher = {Addison Wesley},
  edition = {2nd},
  year = {2000},
}

% Web search behaviour and query logs.
@article{johnson06effective,
  author = {Johnson, David and Malhotra, Vishv and Vamplew, Peter},
  year = {2006},
  title = {More Effective Web Search Using Bigrams and Trigrams},
  journal = {Webology},
  volume = {3},
  number = {4},
  note = {Article 35},
  url = {www.webology.ir/2006/v3n4/a35.html},
}

@inproceedings{kammenhuber06web,
  author = {Nils Kammenhuber and Julia Luxenburger and Anja Feldmann and Gerhard Weikum},
  title = {Web search clickstreams},
  booktitle = {Proc. ACM SIGCOMM on Internet Measurement},
  publisher = {ACM Press},
  pages = {245--250},
  year = {2006},
  address = {Rio de Janeiro, Brazil},
}

% Page range uses an en-dash (double hyphen). The ee and bibsource fields
% are not standard BibTeX fields; they are kept as exported.
@article{silverstein99analysis,
  author = {Craig Silverstein and Monika Rauch Henzinger and Hannes Marais and Michael Moricz},
  title = {Analysis of a Very Large Web Search Engine Query Log},
  journal = {SIGIR Forum},
  volume = {33},
  number = {1},
  year = {1999},
  pages = {6--12},
  ee = {db/journals/sigir/SilversteinHMM99.html},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@techreport{silverstein98analysis,
  author = {Craig Silverstein and Monika Henzinger and Hannes Marais and Michael Moricz},
  title = {Analysis of a Very Large {A}lta{V}ista Query Log},
  year = {1998},
  institution = {Digital SRC},
  number = {1998-014},
}

@article{luk02comparison,
  title = {A comparison of {C}hinese document indexing strategies and retrieval models},
  author = {Robert W. P.
Luk and Kui-Lam Kwok},
  journal = {ACM Transactions on Asian Language Information Processing},
  year = {2002},
  volume = {1},
  number = {3},
  pages = {225--268},
}

% Multilingual and cross-language retrieval.
@inproceedings{kishida05clir,
  author = {Kazuaki Kishida and Kuang-Hua Chen and Sukhoon Lee and Kazuko Kuriyama and Noriko Kando and Hsin-Hsi Chen and Sung Hyon Myaeng},
  title = {Overview of {CLIR} Task at the Fifth {NTCIR} Workshop},
  booktitle = {{NTCIR} Workshop Meeting on Evaluation of Information Access Technologies: {I}nformation Retrieval, Question Answering and Cross-Lingual Information Access},
  year = {2005},
  publisher = {National Institute of Informatics},
  address = {Tokyo},
}

@misc{sifry07state,
  author = {Dave Sifry},
  title = {The State of the {L}ive {W}eb, {A}pril 2007},
  year = {2007},
  url = {technorati.com/weblog/2007/04/328.html},
}

@article{gerrand07estimating,
  author = {Gerrand, Peter},
  title = {Estimating linguistic diversity on the Internet: {A} taxonomy to avoid pitfalls and paradoxes},
  journal = {Journal of Computer-Mediated Communication},
  volume = {12},
  number = {4},
  year = {2007},
  note = {article 8},
  url = {jcmc.indiana.edu/vol12/issue4/gerrand.html},
}

@article{hollink04monolingual,
  author = {Vera Hollink and Jaap Kamps and Christof Monz and Maarten de Rijke},
  title = {Monolingual Document Retrieval for {E}uropean Languages},
  journal = {IR},
  volume = {7},
  number = {1},
  pages = {33--52},
  year = {2004},
}

@inproceedings{tomlinson03lexical,
  author = {Stephen Tomlinson},
  title = {Lexical and Algorithmic Stemming Compared for 9 {E}uropean Languages with {H}ummingbird {S}earchServer at {CLEF 2003}},
  booktitle = {Proc.
Cross-Language Evaluation Forum},
  pages = {286--300},
  year = {2003},
}

@article{barilan05how,
  author = {Judit Bar-Ilan and Tatyana Gutman},
  title = {How do search engines respond to some non-{E}nglish queries?},
  journal = {Journal of Information Science},
  year = {2005},
  volume = {31},
  number = {1},
  pages = {13--28},
}

% Both authors now use the same "Last, First" name form.
@book{jackson02natural,
  author = {Jackson, Peter and Moulinier, Isabelle},
  year = {2002},
  title = {Natural Language Processing for Online Applications: {T}ext Retrieval, Extraction and Categorization},
  publisher = {John Benjamins},
  isbn = {1-58811-250-0},
}

% The stanford field is a non-standard field (ignored by standard styles);
% it is kept unchanged.
@inproceedings{hayes90construe,
  author = {Philip J. Hayes and Steven P. Weinstein},
  year = {1990},
  title = {{CONSTRUE/TIS}: {A} System for Content-Based Indexing of a Database of News Stories},
  booktitle = {Proc. Conference on Innovative Applications of Artificial Intelligence},
  pages = {49--66},
  stanford = {Green or Math Q334 .I5433 1990},
}

@inproceedings{klein02conditional,
  author = {Dan Klein and Christopher D. Manning},
  year = {2002},
  title = {Conditional Structure versus Conditional Estimation in {NLP} Models},
  booktitle = {Proc. Empirical Methods in Natural Language Processing},
  pages = {9--16},
}

@inproceedings{banko01scaling,
  author = {Michele Banko and Eric Brill},
  year = {2001},
  title = {Scaling to Very Very Large Corpora for Natural Language Disambiguation},
  booktitle = {Proc. ACL},
}

% Title is entered in consistent Title Case ("Large-Scale"), and the book
% title uses an en-dash instead of a spaced hyphen.
@incollection{joachims99making,
  author = {Thorsten Joachims},
  year = {1999},
  title = {Making Large-Scale {SVM} Learning Practical},
  booktitle = {Advances in Kernel Methods -- Support Vector Learning},
  editor = {B. Sch{\"o}lkopf and C. Burges and A.
Smola},
  publisher = {MIT Press},
}

% Citation analysis and link structure.
@article{garfield55,
  author = {Eugene Garfield},
  title = {Citation indexes to science: {A} new dimension in documentation through association of ideas},
  journal = {Science},
  volume = {122},
  pages = {108--111},
  year = {1955},
}

@article{pinskinarin,
  author = {Gabriel Pinski and Francis Narin},
  title = {Citation Influence for Journal Aggregates of Scientific Publications: {T}heory, with Application to the Literature of {P}hysics},
  journal = {IP\&M},
  volume = {12},
  pages = {297--326},
  year = {1976},
}

@article{kumar99trawling,
  author = {Ravi Kumar and Prabhakar Raghavan and Sridhar Rajagopalan and Andrew Tomkins},
  title = {Trawling the {Web} for emerging cyber-communities},
  journal = {Computer Networks},
  volume = {31},
  number = {11--16},
  pages = {1481--1493},
  year = {1999},
  url = {citeseer.ist.psu.edu/kumar99trawling.html},
}

@article{jacobs90scisor,
  author = {Paul S. Jacobs and Lisa F. Rau},
  title = {{SCISOR}: {E}xtracting Information from On-line News},
  journal = {CACM},
  volume = {33},
  pages = {88--97},
  year = {1990},
}

% Historical papers on information retrieval.
@article{mooers50coding,
  author = {Mooers, Calvin E.},
  title = {Coding, Information Retrieval, and the Rapid Selector},
  journal = {American Documentation},
  volume = {1},
  number = {4},
  pages = {225--229},
  year = {1950},
}

@article{kent55operational,
  author = {Allen Kent and Madeline M. Berry and Luehrs, Jr., Fred U. and J. W. Perry},
  title = {Machine Literature Searching {VIII}. {O}perational Criteria for Designing Information Retrieval Systems},
  journal = {American Documentation},
  volume = {6},
  number = {2},
  pages = {93--101},
  year = {1955},
}

@article{swanson88historical,
  author = {Don R. Swanson},
  title = {Historical Note: {I}nformation Retrieval and the Future of an Illusion},
  journal = {JASIS},
  volume = {39},
  number = {2},
  pages = {92--98},
  year = {1988},
}

@incollection{littman98automatic,
  author = {Michael L. Littman and Susan T. Dumais and Thomas K.
Landauer},
  title = {Automatic cross-language information retrieval using latent semantic indexing},
  editor = {Gregory Grefenstette},
  booktitle = {Cross-Language Information Retrieval},
  year = {1998},
  url = {citeseer.ist.psu.edu/littman98automatic.html},
  publisher = {Kluwer},
}
% The entry that closes just above is a chapter in an edited book (it has an
% editor and a publisher), so its booktitle no longer starts with "Proc.".

@article{berryyoung1995,
  author = {Michael Berry and Paul Young},
  year = {1995},
  title = {Using latent semantic indexing for multilanguage information retrieval},
  journal = {Computers and the Humanities},
  volume = {29},
  number = {6},
  pages = {413--429},
}

@book{kemenysnell,
  title = {Finite {M}arkov Chains},
  address = {New York},
  author = {John G. Kemeny and J. Laurie Snell},
  publisher = {Springer},
  year = {1976},
}

@phdthesis{brown95,
  author = {Eric W. Brown},
  title = {Execution Performance Issues in Full-Text Information Retrieval},
  school = {University of Massachusetts, Amherst},
  year = {1995},
}

% "PageRank" is a proper name: written in its usual camel case and
% brace-protected so that styles do not downcase it.
@article{berkhinbsa,
  author = {Pavel Berkhin},
  year = {2006},
  title = {Bookmark-Coloring Algorithm for Personalized {PageRank} Computing},
  journal = {Internet Mathematics},
  volume = {3},
  number = {1},
  pages = {41--62},
}

% The annote text breaks off mid-sentence in the source; it is kept as found.
@inproceedings{murata00japanese,
  author = {Masaki Murata and Qing Ma and Kiyotaka Uchimoto and Hiromi Ozaku and Masao Utiyama and Hitoshi Isahara},
  year = {2000},
  title = {Japanese probabilistic information retrieval using location and category information},
  booktitle = {International Workshop on Information Retrieval With {A}sian Languages},
  pages = {81--88},
  url = {portal.acm.org/citation.cfm?doid=355214.355226},
  annote = {Improves ad hoc IR results (IREX) by upweighting terms in title and first sentence of newswire docs. Doesn't clearly distinguish the effectiveness of the two, but},
}

@article{ko04improving,
  author = {Youngjoong Ko and Jinwoo Park and Jungyun Seo},
  title = {Improving text categorization using the importance of sentences},
  year = {2004},
  journal = {IP\&M},
  volume = {40},
  number = {1},
  pages = {65--79},
}

@article{cohen99context,
  author = {William W.
Cohen and Yoram Singer},
  year = {1999},
  title = {Context-Sensitive Learning Methods for Text Categorization},
  journal = {TOIS},
  volume = {17},
  number = {2},
  pages = {141--173},
}

% Published at CIKM 2001 (as the citation key indicates), not 2000.
@inproceedings{kolcz01summarization,
  author = {Ko{\l}cz, Aleksander and Prabakarmurthi, Vidya and Kalita, Jugal},
  title = {Summarization as feature selection for text categorization},
  booktitle = {Proc. CIKM},
  publisher = {ACM Press},
  year = {2001},
  pages = {365--370},
}

% The Russian journal name ends in "SSSR" (three S).
@article{kozlov79polynomial,
  author = {Kozlov, M. K. and Tarasov, S. P. and Khachiyan, L. G.},
  title = {Polynomial Solvability of Convex Quadratic Programming},
  journal = {Soviet Mathematics Doklady},
  volume = {20},
  year = {1979},
  pages = {1108--1111},
  note = {Translated from original in \emph{Doklady Akademiia Nauk SSSR}, 228 (1979)},
}

@inproceedings{kolcz07raising,
  author = {Aleksander Ko{\l}cz and {Wen-Tau} Yih},
  title = {Raising the Baseline for High-Precision Text Classifiers},
  booktitle = {Proc. KDD},
  year = {2007},
}

% Support vector machines: probabilistic outputs and multiclass variants.
% Editor initials are space-separated so each is parsed as its own token.
@incollection{platt00probabilistic,
  author = {John Platt},
  title = {Probabilistic outputs for support vector machines and comparisons to regularized likelihood methods},
  editor = {A. J. Smola and P. L. Bartlett and B. Sch{\"o}lkopf and D. Schuurmans},
  booktitle = {Advances in Large Margin Classifiers},
  pages = {61--74},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  year = {2000},
}

@inproceedings{weston99svms,
  author = {Jason Weston and Chris Watkins},
  title = {Support Vector Machines for Multi-class Pattern Recognition},
  year = {1999},
  booktitle = {Proc. European Symposium on Artificial Neural Networks},
  pages = {219--224},
}

@article{crammer01algorithmic,
  author = {Koby Crammer and Yoram Singer},
  year = {2001},
  title = {On the algorithmic implementation of multiclass kernel-based machines},
  journal = {JMLR},
  volume = {2},
  pages = {265--292},
}

@inproceedings{geng07feature,
  author = {Xiubo Geng and Tie-Yan Liu and Tao Qin and Hang Li},
  title = {Feature Selection for Ranking},
  booktitle = {Proc.
SIGIR},
  publisher = {ACM Press},
  pages = {407--414},
  year = {2007},
}

% Graded-relevance evaluation.
@article{jarvelin02cumulated,
  author = {Kalervo J{\"a}rvelin and Jaana Kek{\"a}l{\"a}inen},
  title = {Cumulated gain-based evaluation of {IR} techniques},
  journal = {TOIS},
  year = {2002},
  volume = {20},
  number = {4},
  pages = {422--446},
}

@article{kekalainen02graded,
  author = {Jaana Kek{\"a}l{\"a}inen and Kalervo J{\"a}rvelin},
  title = {Using Graded Relevance Assessments in {IR} Evaluation},
  journal = {JASIST},
  year = {2002},
  volume = {53},
  number = {13},
  pages = {1120--1129},
}

% Learning to rank.
@inproceedings{burges05learning,
  author = {Chris Burges and Tal Shaked and Erin Renshaw and Ari Lazier and Matt Deeds and Nicole Hamilton and Greg Hullender},
  title = {Learning to rank using gradient descent},
  booktitle = {Proc. ICML},
  year = {2005},
}

% Editors added to match the platt00probabilistic entry, which cites the same
% volume (same booktitle, publisher, address and year).
@incollection{herbrich00large,
  author = {Ralf Herbrich and Thore Graepel and Klaus Obermayer},
  year = {2000},
  title = {Large margin rank boundaries for ordinal regression},
  editor = {A. J. Smola and P. L. Bartlett and B. Sch{\"o}lkopf and D. Schuurmans},
  booktitle = {Advances in Large Margin Classifiers},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  pages = {115--132},
}

@inproceedings{yue07svm,
  author = {Yisong Yue and Thomas Finley and Filip Radlinski and Thorsten Joachims},
  title = {A Support Vector Method for Optimizing Average Precision},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2007},
}

@inproceedings{taylor06optimisation,
  author = {Michael Taylor and Hugo Zaragoza and Nick Craswell and Stephen Robertson and Chris Burges},
  title = {Optimisation methods for ranking functions with multiple parameters},
  booktitle = {Proc. CIKM},
  publisher = {ACM Press},
  year = {2006},
}

% Page range uses an en-dash (double hyphen).
@inproceedings{wong88linear,
  author = {S. K. Michael Wong and Yiyu Yao and Peter Bollmann},
  title = {Linear Structure in Information Retrieval},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {1988},
  pages = {219--232},
}

@inproceedings{gey94inferring,
  author = {Fredric C.
Gey},
  title = {Inferring Probability of Relevance Using the Method of Logistic Regression},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {1994},
  pages = {222--231},
}

@inproceedings{cao06adapting,
  author = {Yunbo Cao and Jun Xu and Tie-Yan Liu and Hang Li and Yalou Huang and Hsiao-Wuen Hon},
  title = {Adapting {R}anking {SVM} to Document Retrieval},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2006},
}

@inproceedings{qin07ranking,
  author = {Tao Qin and Tie-Yan Liu and Wei Lai and Xu-Dong Zhang and De-Sheng Wang and Hang Li},
  title = {Ranking with Multiple Hyperplanes},
  booktitle = {Proc. SIGIR},
  publisher = {ACM Press},
  year = {2007},
}

@incollection{indyk04nearest,
  author = {Piotr Indyk},
  title = {Nearest Neighbors in High-Dimensional Spaces},
  booktitle = {Handbook of Discrete and Computational Geometry},
  edition = {2nd},
  pages = {877--892},
  editor = {J. E. Goodman and J. O'Rourke},
  publisher = {Chapman and Hall/CRC Press},
  address = {New York},
  year = {2004},
}

%% Chris reinserted missing references

% The address had "UK" repeated; the accented i uses the dotless-i form so
% the accent does not sit on top of the dot.
@article{godoy06modeling,
  author = {Daniela Godoy and Anal{\'\i}a Amandi},
  title = {Modeling user interests by conceptual clustering},
  journal = {Information Systems},
  volume = {31},
  number = {4},
  year = {2006},
  issn = {0306-4379},
  pages = {247--265},
  doi = {dx.doi.org/10.1016/j.is.2005.02.008},
  publisher = {Elsevier Science},
  address = {Oxford, UK},
}

@inproceedings{fang04formal,
  author = {Hui Fang and Tao Tao and ChengXiang Zhai},
  title = {A formal study of information retrieval heuristics},
  booktitle = {Proc.
SIGIR},
  publisher = {ACM Press},
  year = {2004},
  isbn = {1-58113-881-4},
  pages = {49--56},
  location = {Sheffield, United Kingdom},
  doi = {doi.acm.org/10.1145/1008992.1009004},
  address = {New York, NY},
}

% Structured (XML) retrieval.
@article{sauvagnat06answering,
  author = {Karen Sauvagnat and Mohand Boughanem and Claude Chrisment},
  title = {Answering content and structure-based queries on {XML} documents using relevance propagation},
  journal = {Information Systems},
  volume = {31},
  number = {7},
  year = {2006},
  issn = {0306-4379},
  pages = {621--635},
  doi = {dx.doi.org/10.1016/j.is.2005.11.007},
  publisher = {Elsevier Science},
  address = {Oxford, UK},
}

% Fields not given here are inherited through crossref from fuhr07comparative.
@inproceedings{westerveld07tijah,
  author = {Thijs Westerveld and Henning Rode and Roel van Os and Djoerd Hiemstra and Georgina Ram{\'\i}rez and Vojkan Mihajlovic and Arjen P. de Vries},
  title = {Evaluating Structured Information Retrieval and Multimedia Retrieval using {PF/Tijah}},
  year = {2007},
  pages = {104--114},
  crossref = {fuhr07comparative},
}

@inproceedings{vogt99user,
  author = {Christopher C. Vogt and Garrison W. Cottrell and Richard K. Belew and Brian T. Bartell},
  title = {User Lenses -- {A}chieving 100\% Precision on Frequently Asked Questions},
  year = {1999},
  booktitle = {Proc. International Conference on User Modelling},
}

% Trailing period removed from the title (styles add their own punctuation)
% and the page range now uses an en-dash. The bibsource field is a
% non-standard field, kept as exported.
@article{cambazoglu06performance,
  author = {{Berkant Barla} Cambazoglu and Cevdet Aykanat},
  title = {Performance of query processing implementations in ranking-based text retrieval systems using inverted indices},
  journal = {IP\&M},
  volume = {42},
  number = {4},
  year = {2006},
  pages = {875--898},
  doi = {dx.doi.org/10.1016/j.ipm.2005.06.004},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{fradkin03experiments,
  author = {Dmitriy Fradkin and David Madigan},
  title = {Experiments with random projections for machine learning},
  booktitle = {Proc.
KDD},
  publisher = {ACM Press},
  year = {2003},
  isbn = {1-58113-737-0},
  pages = {517--522},
  location = {Washington, D.C.},
  doi = {doi.acm.org/10.1145/956750.956812},
  address = {New York, NY},
}

% Index structures and nearest-neighbour search.
@inproceedings{bennett99densitybased,
  author = {Kristin P. Bennett and Usama Fayyad and Dan Geiger},
  title = {Density-based indexing for approximate nearest-neighbor queries},
  booktitle = {Proc. KDD},
  pages = {233--243},
  year = {1999},
  publisher = {ACM Press},
  address = {New York, NY},
  location = {San Diego, California, United States},
  isbn = {1-58113-143-7},
  doi = {doi.acm.org/10.1145/312129.312236},
}

@inproceedings{buckley85optimization,
  author = {Chris Buckley and Alan F. Lewit},
  title = {Optimization of inverted vector searches},
  booktitle = {Proc. SIGIR},
  pages = {97--110},
  year = {1985},
  publisher = {ACM Press},
  address = {New York, NY},
  location = {Montreal},
  isbn = {0-89791-159-8},
  doi = {doi.acm.org/10.1145/253495.253515},
}

@inproceedings{guttman84rtrees,
  author = {Antonin Guttman},
  title = {R-trees: {A} dynamic index structure for spatial searching},
  booktitle = {Proc. SIGMOD},
  pages = {47--57},
  year = {1984},
  publisher = {ACM Press},
  address = {New York, NY},
  location = {Boston, Massachusetts},
  isbn = {0-89791-128-8},
  doi = {doi.acm.org/10.1145/602259.602266},
}

% Classification.
@article{fuernkranz02round,
  author = {Johannes F{\"u}rnkranz},
  title = {Round robin classification},
  journal = {JMLR},
  volume = {2},
  pages = {721--747},
  year = {2002},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  issn = {1533-7928},
}

@inproceedings{forman04learning,
  author = {George Forman and Ira Cohen},
  title = {Learning from Little: {C}omparison of Classifiers Given Little Training},
  booktitle = {Proc.
PKDD},
  year = {2004},
  pages = {161--172},
  ee = {springerlink.metapress.com/openurl.asp?genre=article{\&}issn=0302-9743{\&}volume=3202{\&}spage=161},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}
% Page ranges in this group use an en-dash (double hyphen). The ee and
% bibsource fields are non-standard fields, kept as exported.

@inproceedings{bottou94convergence,
  author = {L{\'e}on Bottou and Yoshua Bengio},
  title = {Convergence Properties of the K-Means Algorithms},
  booktitle = {Proc. NIPS},
  year = {1994},
  pages = {585--592},
  ee = {nips.djvuzone.org/djvu/nips07/0585.djvu},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

% The acronym EM is brace-protected against style downcasing; the issue
% range uses an en-dash.
@article{nigam00em,
  author = {Kamal Nigam and Andrew Kachites McCallum and Sebastian Thrun and Tom Mitchell},
  title = {Text Classification from Labeled and Unlabeled Documents using {EM}},
  journal = {Machine Learning},
  volume = {39},
  number = {2--3},
  year = {2000},
  issn = {0885-6125},
  pages = {103--134},
  publisher = {Kluwer},
  address = {Hingham, MA},
}

% The accented n is written as a single-level brace group so BibTeX treats
% it as one accented letter.
@inproceedings{weiss03web,
  editor = {Mieczys{\l{}}aw A. K{\l{}}opotek and S{\l{}}awomir T. Wierzcho{\'n} and Krzysztof Trojanowski},
  author = {Dawid Weiss and Jerzy Stefanowski},
  title = {Web Search Results Clustering in {P}olish: {E}xperimental evaluation of {C}arrot},
  booktitle = {Proc. New Trends in Intelligent Information Processing and Web Mining Conference},
  year = {2003},
}

% Second author uses the "Last, Jr., First" form so "Jr." is parsed as a
% suffix instead of being part of the surname.
@inproceedings{azcarraga01extracting,
  author = {Arnulfo P. Azcarraga and Yap, Jr., Teddy N.},
  title = {Extracting Meaningful Labels for {WEBSOM} Text Archives},
  booktitle = {Proc. CIKM},
  publisher = {ACM Press},
  pages = {41--48},
  year = {2001},
  url = {citeseer.ist.psu.edu/azcarraga01extracting.html},
}

@inproceedings{roos06compression,
  author = {Teemu Roos and Tuomas Heikkil{\"a} and Petri Myllym{\"a}ki},
  title = {A Compression-Based Method for Stemmatic Analysis},
  booktitle = {Proc. ECAI},
  year = {2006},
  pages = {805--806},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@article{hand01idiot,
  author = {David J.
Hand and Keming Yu},
  title = {Idiot's {Bayes}: {N}ot So Stupid after All},
  year = {2001},
  journal = {International Statistical Review},
  volume = {69},
  number = {3},
  pages = {385--398},
}

% Page range uses an en-dash (double hyphen).
@inproceedings{pavlov04document,
  author = {Dmitry Pavlov and Ramnath Balasubramanyan and Byron Dom and Shyam Kapur and Jignashu Parikh},
  title = {Document Preprocessing For Naive {Bayes} Classification and Clustering with Mixture of Multinomials},
  booktitle = {Proc. KDD},
  pages = {829--834},
  year = {2004},
}

% Music, image and speech retrieval.
% The ISBN-13 was missing its final check digit; for the twelve digits given
% the check digit works out to 2. The acronym is brace-protected.
@proceedings{ismir07,
  title = {International Conference on Music Information Retrieval ({ISMIR} 2007)},
  year = {2007},
  editor = {Simon Dixon and David Bainbridge and Rainer Typke},
  isbn = {978-3-85403-218-2},
  url = {ismir2007.ismir.net/},
}

% Month given as the standard three-letter macro.
@article{downie06music,
  author = {J. Stephen Downie},
  title = {The {M}usic {I}nformation {R}etrieval {E}valuation e{X}change ({MIREX})},
  journal = {D-Lib Magazine},
  year = {2006},
  month = dec,
  volume = {12},
  number = {12},
  issn = {1082-9873},
}

@book{bimbo99visual,
  author = {del Bimbo, Alberto},
  title = {Visual Information Retrieval},
  year = {1999},
  publisher = {Morgan Kaufmann},
}

@book{lew01principles,
  author = {Michael S. Lew},
  title = {Principles of Visual Information Retrieval},
  year = {2001},
  publisher = {Springer},
}

@book{coden02speech,
  title = {Information Retrieval Techniques for Speech Applications},
  editor = {Anni R. Coden and Eric W. Brown and Savitha Srinivasan},
  year = {2002},
  publisher = {Springer},
}

@book{lesk04understanding,
  author = {Michael Lesk},
  title = {Understanding Digital Libraries},
  year = {2004},
  edition = {2nd},
  publisher = {Morgan Kaufmann},
}

% Edit distance.
@article{levenshtein66binary,
  author = {Vladimir I. Levenshtein},
  journal = {Soviet Physics Doklady},
  number = {8},
  pages = {707--710},
  title = {Binary codes capable of correcting deletions, insertions, and reversals},
  volume = {10},
  year = {1966},
}

@article{wagner74string,
  author = {Robert A. Wagner and Michael J.
Fischer},
  title = {The String-to-String Correction Problem},
  journal = {JACM},
  volume = {21},
  number = {1},
  year = {1974},
  issn = {0004-5411},
  pages = {168--173},
  doi = {doi.acm.org/10.1145/321796.321811},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

% Topic models and language models for retrieval.
@article{teh06hdp,
  author = {Yee Whye Teh and Michael I. Jordan and Matthew J. Beal and David M. Blei},
  title = {Hierarchical {D}irichlet Processes},
  journal = {Journal of the American Statistical Association},
  volume = {101},
  number = {476},
  pages = {1566--1581},
  year = {2006},
}

@inproceedings{wei06lda,
  author = {Xing Wei and W. Bruce Croft},
  title = {{LDA}-based document models for ad-hoc retrieval},
  booktitle = {Proc. SIGIR},
  pages = {178--185},
  year = {2006},
  publisher = {ACM Press},
  address = {New York, NY, USA},
  location = {Seattle, Washington, USA},
  isbn = {1-59593-369-7},
  doi = {doi.acm.org/10.1145/1148170.1148204},
}

@inproceedings{lavrenko01relevance,
  author = {Lavrenko, Victor and Croft, W. Bruce},
  title = {Relevance-based language models},
  booktitle = {Proc. SIGIR},
  pages = {120--127},
  year = {2001},
  publisher = {ACM Press},
}

@inproceedings{zhai02two,
  author = {ChengXiang Zhai and John Lafferty},
  title = {Two-stage language models for information retrieval},
  booktitle = {Proc. SIGIR},
  pages = {49--56},
  year = {2002},
  publisher = {ACM Press},
  address = {New York, NY, USA},
  location = {Tampere, Finland},
  isbn = {1-58113-561-0},
  doi = {doi.acm.org/10.1145/564376.564387},
}

@inproceedings{kraaij02importance,
  author = {Wessel Kraaij and Thijs Westerveld and Djoerd Hiemstra},
  title = {The Importance of Prior Probabilities for Entry Page Search},
  booktitle = {Proc. SIGIR},
  pages = {27--34},
  year = {2002},
  publisher = {ACM Press},
}

@inproceedings{tao06language,
  author = {Tao Tao and Xuanhui Wang and Qiaozhu Mei and ChengXiang Zhai},
  title = {Language Model Information Retrieval with Document Expansion},
  booktitle = {Proc.
Human Language Technology Conference / North American Chapter of the Association for Computational Linguistics}, year = 2006, pages = {407--414} } @incollection{lafferty03probabilistic, author = {John Lafferty and Chengxiang Zhai}, title = {Probabilistic relevance models based on document and query generation}, editor = {W. Bruce Croft and John Lafferty}, booktitle = {Language Modeling for Information Retrieval}, year = 2003, publisher = {Kluwer} } @incollection{kraaij03language, author = {Wessel Kraaij and Martijn Spitters}, year = 2003, title = {Language Models for Topic Tracking}, booktitle = {Language Modeling for Information Retrieval}, editor = {W. B. Croft and J. Lafferty}, pages = {95--124}, publisher = {Kluwer} } @inproceedings{xu99clusterbased, author = {Jinxi Xu and W. Bruce Croft}, title = {Cluster-based language models for distributed retrieval}, booktitle = {Proc. SIGIR}, year = {1999}, isbn = {1-58113-096-1}, pages = {254--261}, location = {Berkeley, California, United States}, doi = {doi.acm.org/10.1145/312624.312687}, publisher = {ACM Press}, address = {New York, NY, USA}, } @article{muresan04topic, author = {Gheorghe Muresan and David J. Harper}, title = {Topic modeling for mediated access to very large document collections}, journal = {JASIST}, volume = {55}, number = {10}, year = {2004}, issn = {1532-2882}, pages = {892--910}, doi = {dx.doi.org/10.1002/asi.20034}, publisher = {John Wiley \& Sons}, address = {New York, NY, USA}, } @inproceedings{kurland04corpus, author = {Oren Kurland and Lillian Lee}, title = {Corpus structure, language models, and ad hoc information retrieval}, booktitle = {Proc. SIGIR}, year = {2004}, isbn = {1-58113-881-4}, pages = {194--201}, location = {Sheffield, United Kingdom}, doi = {doi.acm.org/10.1145/1008992.1009027}, publisher = {ACM Press}, address = {New York, NY, USA}, } @inproceedings{buckley00evaluating, author = {Chris Buckley and Ellen M. 
Voorhees}, year = 2000, title = {Evaluating Evaluation Measure Stability}, booktitle = {Proc. SIGIR}, pages = {33--40} } @InProceedings{tague-sutcliffe95statistical, author = "Jean Tague-Sutcliffe and James Blustein", title = "A statistical analysis of the {TREC-3} data", booktitle = {Proc. TREC}, pages = "385--398", year = 1995, } @article{sakai07reliability, author = {Tetsuya Sakai}, year = 2007, title = {On the reliability of information retrieval metrics based on graded relevance}, journal = {IP\&M}, volume = 43, number = 2, pages = {531--548} } @inproceedings{zobel98reliable, author = {Justin Zobel}, year = 1998, title = {How reliable are the results of large-scale information retrieval experiments?}, booktitle = {Proc. SIGIR}, pages = {307--314} } @article{schamber90re-examination, author = {Linda Schamber and Michael Eisenberg and Michael S. Nilan}, year = 1990, title = {A re-examination of relevance: toward a dynamic, situational definition}, journal = {IP\&M}, volume = 26, number = 6, pages = {755--776} } @inproceedings{hersh00batch, author = {William R. Hersh and Andrew Turpin and Susan Price and Benjamin Chan and Dale Kraemer and Lynetta Sacherek and Daniel Olson}, title = {Do batch and user evaluation give the same results?}, booktitle = {Proc. SIGIR}, YEAR = 2000, pages = {17--24} } @inproceedings{hersh00further, author = {William R. Hersh and Andrew Turpin and Lynetta Sacherek and Daniel Olson and Susan Price and Benjamin Chan and Dale Kraemer}, title = {Further Analysis of Whether Batch and User Evaluations Give the Same Results with a Question-Answering Task}, booktitle = {Proc. TREC}, year = 2000 } @article{hersh01challenging, author = {William R. 
Hersh and Andrew Turpin and Susan Price and Dale Kraemer and Daniel Olson and Benjamin Chan and Lynetta Sacherek}, title = {Challenging conventional assumptions of automated information retrieval with real users: Boolean searching and batch retrieval evaluations}, journal = {IP\&M}, volume = 37, number = 3, pages = {383--402}, year = 2001 } @inproceedings{turpin01why, author = {Andrew Turpin and William R. Hersh}, title = {Why Batch and User Evaluations Do Not Give the Same Results}, booktitle = {Proc. SIGIR}, year = 2001, pages = {225--231} } @inproceedings{turpin02user, author = {Andrew Turpin and William R. Hersh}, title = {User interface effects in past batch versus user experiments}, booktitle = {Proc. SIGIR}, year = 2002, pages = {431--432} } @incollection{dietterich01ensemble, author = {Dietterich, T.G.}, year = 2001, title = {Ensemble methods in machine learning}, editor = {Kittler, Josef and Roli, Fabio}, booktitle = {Multiple Classifier Systems}, series = {LNCS}, volume = 1857, publisher = {Springer}, pages = {1-–15} } @incollection{dietterich02ensemble, author = {Thomas G. Dietterich}, title = {Ensemble Learning}, booktitle = {The Handbook of Brain Theory and Neural Networks}, editor = {Michael A. Arbib}, publisher = {MIT Press}, edition = {2nd}, year = 2002 } @incollection{schapire03boosting, author = {Robert E. Schapire}, title = {The boosting approach to machine learning: An overview}, editor = {D. D. Denison and M. H. Hansen and C. Holmes and B. Mallick and B. Yu}, booktitle = {Nonlinear Estimation and Classification}, publisher = {Springer}, year = 2003 } @article{schapire00boostexter, author = {Robert E. 
Schapire and Yoram Singer}, title = {BoosTexter: A boosting-based system for text categorization}, journal = {Machine Learning}, volume = 39, number = {2/3}, pages = {135--168}, year = 2000 } @book{chapelle06semi-supervised, editor = {Olivier Chapelle and Bernhard Sch{\"o}lkopf and Alexander Zien}, year = 2006, title = {Semi-Supervised Learning}, publisher = {MIT Press}, address = {Cambridge, MA} } @incollection{nigam06semi-supervised, author = {Kamal Nigam and Andrew McCallum and Tom Mitchell}, title = {Semi-supervised Text Classification Using {EM}}, crossref = {chapelle06semi-supervised}, pages = {33--56} } @incollection{joachims06transductive, author = {Thorsten Joachims}, title = {Transductive Support Vector Machines}, crossref = {chapelle06semi-supervised}, pages = {105--118} } @article{tong01svm, author = {Simon Tong and Daphne Koller}, year = 2001, title = {Support Vector Machine Active Learning with Applications to Text Classification}, journal = {JMLR}, volume = 2, pages = {45-66} } @inproceedings{baldridge04active, title = {Active learning and the total cost of annotation}, author = {Jason Baldridge and Miles Osborne}, booktitle = {Proc. Empirical Methods in Natural Language Processing}, pages = {9--16}, year = {2004}, abstract = {Active learning (AL) promises to reduce the cost of annotating labeled datasets for trainable human language technologies. Contrary to expectations, when creating labeled training material for HPSG parse selection and later reusing it with other models, gains from AL may be negligible or even negative. This has serious implications for using AL, showing that additional cost-saving strategies may need to be adopted. We explore one such strategy: using a model during annotation to automate some of the decisions. Our best results show an 80% reduction in annotation cost compared with labeling randomly selected data with a single model.} } @inproceedings{sindhwani06large, author = {Sindhwani, V. and Keerthi, S. 
S.}, year = 2006, title = {Large scale semi-supervised linear {SVMs}}, booktitle = {Proc. SIGIR}, pages = {477--484} } @inproceedings{richardson06beyond, author = {Richardson, M. and Prakash, A. and Brill, E.}, year = 2006, title = {Beyond {P}age{R}ank: machine learning for static ranking}, booktitle = {Proc. WWW}, pages = {707--715} } @Article{ altingovde08incremental, author = {Ismail Seng{\"o}r Alting{\"o}vde and Engin Demir and Fazli Can and {\"O}zg{\"u}r Ulusoy}, title = {Incremental cluster-based retrieval using compressed cluster-skipping inverted files}, year = {2008}, journal = {TOIS}, note = {To appear}, } @inproceedings{carterette08evaluating, author = {Ben Carterette and Rosie Jones}, title = {Evaluating Search Engines by Modeling the Relationship Between Relevance and Clicks}, booktitle = {Proc. NIPS}, year = 2008 } @inproceedings{metzler05markov, author = {Donald Metzler and W. Bruce Croft}, title = {A {M}arkov random field model for term dependencies}, booktitle = {Proc. SIGIR}, year = 2005, pages = {472--479} } @book{cord08ml, author = {Matthieu Cord and P{\'a}draig Cunningham}, title = {Machine Learning Techniques for Multimedia: Case Studies on Organization and Retrieval}, year = 2008, publisher = {Springer} } @inproceedings{schutze06thresholding, author={Hinrich Sch{\"u}tze and Emre Velipasaoglu and Jan Pedersen}, title={Performance thresholding in practical text classification}, booktitle={ACM CIKM}, year=2006 } @article{dice45:measures, author = {L. R. Dice}, title = {Measures of the amount of ecologic association between species}, year = 1945, journal = {Journal of Ecology}, volume =26, pages = {297--302} } @inproceedings{dhillon02enhanced, author = {Inderjit S. 
Dhillon and Subramanyam Mallela and Rahul Kumar}, title = {Enhanced word clustering for hierarchical text classification}, booktitle = {KDD '02: Proceedings of the eighth ACM SIGKDD international conference on Knowledge discovery and data mining}, year = {2002}, isbn = {1-58113-567-X}, pages = {191--200}, location = {Edmonton, Alberta, Canada}, doi = {http://doi.acm.org/10.1145/775047.775076}, publisher = {ACM}, address = {New York, NY, USA}, }