diff --git a/src/main/xar-resources/data/lucene/listings/listing-50.txt b/src/main/xar-resources/data/lucene/listings/listing-50.txt new file mode 100644 index 00000000..2040ad47 --- /dev/null +++ b/src/main/xar-resources/data/lucene/listings/listing-50.txt @@ -0,0 +1,10 @@ + + + + + + + + + + \ No newline at end of file diff --git a/src/main/xar-resources/data/lucene/listings/listing-51.txt b/src/main/xar-resources/data/lucene/listings/listing-51.txt new file mode 100644 index 00000000..6b03666f --- /dev/null +++ b/src/main/xar-resources/data/lucene/listings/listing-51.txt @@ -0,0 +1,12 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/src/main/xar-resources/data/lucene/listings/listing-52.txt b/src/main/xar-resources/data/lucene/listings/listing-52.txt new file mode 100644 index 00000000..58042ff9 --- /dev/null +++ b/src/main/xar-resources/data/lucene/listings/listing-52.txt @@ -0,0 +1,11 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/src/main/xar-resources/data/lucene/listings/listing-53.txt b/src/main/xar-resources/data/lucene/listings/listing-53.txt new file mode 100644 index 00000000..ce651b07 --- /dev/null +++ b/src/main/xar-resources/data/lucene/listings/listing-53.txt @@ -0,0 +1 @@ +//db:article[ft:query(., "title:(xquery AND language) AND xml")] \ No newline at end of file diff --git a/src/main/xar-resources/data/lucene/listings/listing-54.txt b/src/main/xar-resources/data/lucene/listings/listing-54.txt new file mode 100644 index 00000000..d67170d6 --- /dev/null +++ b/src/main/xar-resources/data/lucene/listings/listing-54.txt @@ -0,0 +1,4 @@ +for $article in collection("/db/articles")//db:article[ft:query(., "xquery", map { "fields": ("title", "author") })] +order by ft:field($article, "title"), ft:field($article, "author")[1] +return + $article \ No newline at end of file diff --git a/src/main/xar-resources/data/lucene/listings/listing-55.txt b/src/main/xar-resources/data/lucene/listings/listing-55.txt new file mode 
100644 index 00000000..e04b4bc2 --- /dev/null +++ b/src/main/xar-resources/data/lucene/listings/listing-55.txt @@ -0,0 +1 @@ +//db:article[ft:query(., (), map { "fields": ("title", "author") })] \ No newline at end of file diff --git a/src/main/xar-resources/data/lucene/listings/listing-56.txt b/src/main/xar-resources/data/lucene/listings/listing-56.txt new file mode 100644 index 00000000..fb47e355 --- /dev/null +++ b/src/main/xar-resources/data/lucene/listings/listing-56.txt @@ -0,0 +1,10 @@ +let $result := collection("/db/articles")//db:article[ft:query(., "xml")] +let $facets := ft:facets($result, "keyword", ()) +return + + { + map:for-each($facets, function($label, $count) { + + }) + } +
{$label}{$count}
\ No newline at end of file diff --git a/src/main/xar-resources/data/lucene/listings/listing-57.txt b/src/main/xar-resources/data/lucene/listings/listing-57.txt new file mode 100644 index 00000000..332a7408 --- /dev/null +++ b/src/main/xar-resources/data/lucene/listings/listing-57.txt @@ -0,0 +1,7 @@ +let $options := map { + "facets": map { + "keyword": ("indexing", "facets") + } +} +return + collection("/db/articles")//db:article[ft:query(., "xml", $options)] \ No newline at end of file diff --git a/src/main/xar-resources/data/lucene/listings/listing-58.xml b/src/main/xar-resources/data/lucene/listings/listing-58.xml new file mode 100644 index 00000000..a90136c4 --- /dev/null +++ b/src/main/xar-resources/data/lucene/listings/listing-58.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/main/xar-resources/data/lucene/listings/listing-59.xml b/src/main/xar-resources/data/lucene/listings/listing-59.xml new file mode 100644 index 00000000..44f78948 --- /dev/null +++ b/src/main/xar-resources/data/lucene/listings/listing-59.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/main/xar-resources/data/lucene/lucene.xml b/src/main/xar-resources/data/lucene/lucene.xml index 229f8aea..980f0aac 100644 --- a/src/main/xar-resources/data/lucene/lucene.xml +++ b/src/main/xar-resources/data/lucene/lucene.xml @@ -366,15 +366,115 @@ - - - Defining Fields - - Sometimes you want to define different Lucene indexes on the same set of - elements, for instance to use a different analyzer. eXist-db allows to - name a certain index using the field attribute: - - Such an index is called named index. See on how to query these indexes. + + Facets and Fields + Starting with eXist 5.0, an index configuration may define additional facets and fields. 
Both can hold arbitrary content, which will be + attached to the indexed parent node and can be used to further refine a query, sort results or display additional information to the + user: + + + facet + + a facet defines a concept or information item by which the indexed + items can be grouped. Typical facets would be categories taken from some + pre-defined taxonomy, languages, dates, places or names occurring in a + text corpus. The goal is to enable users to "drill down" into a + potentially large result set by selecting from a list of facets + displayed to them. For example, if you shop for a laptop, you are often + presented with a list of facets with which you may restrict your result + by CPU type, memory or screen size etc. As you select facets, the result + set will become smaller and smaller. + Facets are always pre-defined at indexing time, so the drill down is + very fast. They are meant for refining other queries, the assumption + always is that the user selects one or more facets from a list of facet + values associated with the current query results. + + + + field + + a field contains additional, searchable content attached to an indexed + parent node. In many cases fields will contain constructed content which + is not directly found in the indexed XML or requires costly computation. + For example, determining publication dates or author names for a set of + articles may require some pre-processing which may be too expensive at + query time. A field allows you to pre-compute those information items at + indexing time. + Fields can be queried in the same expression as the parent node, + resulting in fast response times. Their content can optionally be stored + to speed up display or sorting. Fields may also use a different analyzer + than the parent node, which allows e.g. multiple languages to be handled + separately. + + + + + Facet and Field Configuration + Facets and fields are configured in a similar way. 
Both should appear nested inside the parent index element they are attached to. Let's + assume we have a collection of articles written in docbook. Each article will have a top-level info element describing the + article. Each info element contains a title, one or more authors and a list of keywords in + keywordset. + Keywords are a perfect candidate for a facet, so let's start with it: + + Every facet needs to have a dimension attribute, defining the name of the facet dimension the items will be added to. The + values associated with this facet dimension are determined by the expression attribute: it may contain an arbitrary XQuery + expression rooted in the parent node being indexed. In the example the parent will be a db:article element, so the context item + for the expression is set to this element. + The expression is evaluated and for each result item, a facet value is added + to the dimension using the string value of the item. If the expression returns + the empty sequence for the current parent node, the corresponding facet will be + empty as well. + A facet can also be defined to be hierarchical. A typical example would be a date, which consists of a year, month and day component. By + indexing the single components as separate parts of a hierarchical facet, we enable the user to drill down by year first, then by month and + finally by day. Let's assume each of our docbook articles has a pubdate containing a date in xs:date + format: + + Next, we may want to define fields for the authors and title of the article. In docbook, author can be a complex element, + consisting e.g. of a personname with nested + surname and firstname. For display to the user and sorting we want to pre-compute a normalized string out of those + components: + + A field does not need to define an expression attribute though: if no expression is given, the field's content will be taken + from the parent element. This makes sense e.g. if you would like to index a node twice, e.g.
using a different analyzer. Or you can specify + index="no" on the parent element and index its content with an explicit field. + A field may use a different analyzer than the one used to index the parent + content. Analyzers are referenced through the analyzer attribute as described above. + Typed fields: fields may also declare a type attribute: supported values + are atomic types like xs:date, xs:dateTime, + xs:time, xs:integer, + xs:decimal and their sub-types. Defining a type is + important with respect to sorting (see below), e.g. to get dates in the correct + order. Typed fields can also be retrieved into corresponding XQuery atomic + values, so no additional casting is necessary. However, typed fields cannot be + queried using Lucene's default query parser, only retrieved with + ft:field. + Storing fields: By default the complete content of a field is stored in the Lucene index, + allowing later fast retrieval of the content using + ft:field. You can disable storing the content by adding + attribute store="no". The field will still be indexed and + available for queries though. + + Importing external modules: as can be seen in the field definition for "author" above, expressions can easily become quite verbose, so writing them into an attribute + is not convenient. It is thus also possible to import one or more XQuery modules into the index configuration and use the functions declared + in the module: + + In this example we extract the code for computing the author field into a function idx:authors located in an XQuery + module, module.xql. Note that we're using a relative import path for the module in the at attribute. + The path will be resolved relative to the collection to which the collection configuration applies (not where the collection configuration + itself is stored). It is also important that the module and all dependencies it imports are stored before the collection + configuration is saved and indexing starts.
+ Conditions: sometimes you may want to create a field only if a certain condition is met. For this + purpose, an additional attribute if may be added, containing an XPath expression. If the expression evaluates to an + effective boolean value of true, the field will be created. Otherwise it is skipped. + Conditions are useful to e.g. distinguish between different languages and apply an appropriate analyzer to each. Let's assume our docbook + articles may have both a German and an English version. The language is indicated by the @xml:lang attribute on the top-level + section element. We thus create a separate field for each language and connect it to the analyzer appropriate for the + language: + + Note that we skip indexing the parent article element with index="no" because we do not want a default index, but + rather a separate field for each language, so we can target them in queries explicitly. + @@ -382,119 +482,166 @@ Querying the Index - Querying full text from XQuery is straightforward. For example: - - The query function takes a query string in Lucene's default query syntax. It returns a set of nodes which are relevant with respect to the - query. Lucene assigns a relevance score or rank (a decimal number) to each match. This - score is preserved by eXist-db and can be accessed through the score function. - The higher the score, the more relevant the text. You can use Lucene's features to - "boost" a certain term in the query: give it a higher or lower influence on the final - rank. - Please note that the score is computed relative to the root context of the index. If - you created an index on SPEECH, all scores will be computed based on text in - SPEECH nodes, even though your actual query may only return LINE - children of SPEECH. - The Lucene module is fully supported by eXist-db's query-rewriting optimizer. This - means that the query engine can rewrite the XQuery expression to make best use of the - available indexes.
All the rules and hints given in the tuning guide fully apply to the Lucene index. - To present search results in a Keywords in Context format, you - may want to have a look at eXist-db's KWIC - module. - - - - - Query a Named Index - - To query a named index (see ), use the - ft:query-field($fieldName, $query) instead of - ft:query: - ft:query-field("title", "xml") - ft:query-field works exactly like ft:query, - except that the set of nodes to search is determined by the nodes in the named - index. The function returns the nodes selected by the query, which would be - title elements in the example above. - You can use ft:query-field with an XPath filter expression, - just as you would call ft:query: - //section[ft:query-field("title", "xml")] + The query function takes a query string in Lucene's default query syntax. It returns a set of nodes which are relevant with respect to the query. Lucene assigns a relevance score or rank (a + decimal number) to each match. This score is preserved by eXist-db and can be accessed through the score function. + The higher the score, the more relevant the text. You can use Lucene's features to "boost" a certain term in the query: give it a higher or lower + influence on the final rank. + Please note that the score is computed relative to the root context of the index. If you created an index on SPEECH, all scores will be + computed based on text in SPEECH nodes, even though your actual query may only return LINE children of + SPEECH. + The query string passed to ft:query may be empty. In this case all items from the context sequence are matched and returned. + Using an empty query makes sense in combination with the options for retrieving facets and field values described below. + The Lucene module is fully supported by eXist-db's query-rewriting optimizer. This means that the query engine can rewrite the XQuery expression + to make best use of the available indexes. 
All the rules and hints given in the tuning guide fully apply to the + Lucene index. + To present search results in a Keywords in Context format, you may want to have a look at eXist-db's KWIC module. + + Querying Fields + Fields associated with the indexed parent node (see above) can be queried with + ft:query by prefixing parts of the query expression with the + field name followed by a colon (':') as described in the documentation for Lucene's + default query syntax. For example, the following expression searches for a docbook + article containing the terms "xml" in the text and "xquery language" in the + title: + + Note how subexpressions can be grouped with parentheses to clearly state to which + field they apply. + + + Retrieving Field Content + You can retrieve the content of a field for display or sorting purposes using the + ft:field function. However, fields are always bound to the + result of a full text query, so you cannot retrieve them without calling + ft:query first. Also, keeping track of fields can be + expensive, so ft:query needs to be explicitly passed a parameter + specifying which fields should be made available via the query result. The third, + optional, parameter to ft:query supports a + fields option listing the names of the fields to be provided. + The values of the associated field can then be retrieved for each item in the query + result using ft:field. + One of the most common uses for retrieving field contents will be for sorting the + results of a query. The order by in the example below sorts results by + title first and then by author. + + Note that even though fields are only available with the results of the ft:query, + it is still possible to use them for sorting and displaying the whole available data + set. 
For example, to view all articles in the collection you could pass in an empty + sequence in place of the query string like this: + + Typed fields: If you declared a different type than xs:string on a field, you should + remember to use the 3-parameter variant of ft:field and pass in the name of the desired target type as the third parameter. Reason: + Lucene basically stores all non-text data types as numbers and eXist has no way to figure out the original type of the field. So if you defined + a field with type xs:date, make sure to retrieve it with ft:field($node, "date", "xs:date"), otherwise all + you get is a number. + + + Displaying Facet Counts + Facet counts for the query result can be retrieved if facets are associated with + an indexed parent element. Facet counts for a particular dimension are available as + a map containing an entry for each facet value occurring in one or more items of the + query result. The map links the facet value given as a map key with a positive count + corresponding to the number of times the value occurs in the result set. Facet + values with zero count are never included. + For example, we may use the following query to display the facet counts for the "keyword" dimension in our set of docbook articles: + + Function ft:facets expects a sequence of nodes belonging to a result set obtained from one or more calls to + ft:query. If the sequence was combined from multiple expressions calling ft:query, the facet counts + will be merged. The second parameter of ft:facets specifies the dimension for which facet counts should be retrieved. The third + parameter should be either an empty sequence or a positive integer denoting the maximum number of facets to show. In the case it is smaller than + the total number of facets, only those with the highest counts are returned. Passing an empty sequence means that all facet value counts should + be shown. Please note that facets with a zero occurrence count (i.e.
facets not appearing anywhere in the result) are never returned. + For hierarchical facets only the top-most facet value in the hierarchy will be + returned by default. For example, if you indexed a date facet with separate year, + month and day component, a call to ft:facets($node, "date", ()) will + return facet counts for years only. To also get counts for months, you have to call + ft:facets with a fourth parameter, passing in the year for + which sub-facet counts should be retrieved. To get days, you also need to specify + month and so on. ft:facets($node, "date", (), ("2018", "06")) will thus + return facet counts for all days in June 2018. + + + Refining a Query with Facets + The main purpose of facets is to quickly narrow down a query result, limiting it + to only items which match a certain facet value. To drill down by a given facet + dimension and value, pass a key "facets" in the options map given in the third + parameter of ft:query: + + If you specify more than one value for one facet dimension, these will be linked + together with a logical or, thus returning elements matching any + of the alternative facet values for that dimension. If you specify multiple + dimensions, they are treated as an and, limiting the result to + elements matching both dimensions. - - - Describing Queries in XML - - Lucene's default query syntax does not provide access to all available features. - However, eXist-db's ft:query function also accepts a description - of the query in XML, as an alternative to passing a query string. The XML - description closely mirrors Lucene's query API. It is transformed into an internal - tree of query objects, which is directly passed to Lucene for execution. This has - several advantages, for example you can specify if the order of terms should be - relevant for a phrase query: - + Lucene's default query syntax does not provide access to all available features. 
However, eXist-db's ft:query function also + accepts a description of the query in XML, as an alternative to passing a query string. The XML description closely mirrors Lucene's query API. + It is transformed into an internal tree of query objects, which is directly passed to Lucene for execution. This has several advantages, for + example you can specify if the order of terms should be relevant for a phrase query: - The following elements may occur within a query description: - term + + term + - Defines a single term to be searched in the index. If the root query - element contains a sequence of term elements, wrap them in - bool/ and they will be combined as in a boolean "or" query. - For example: + Defines a single term to be searched in the index. If the root query element contains a sequence of term elements, wrap them in + bool/ and they will be combined as in a boolean "or" query. For example: - This finds all SPEECH elements containing either - nation or miserable or both. + This finds all SPEECH elements containing either nation or miserable or both. - wildcard + + wildcard + - A string with a * wildcard in it. This will be matched - against the terms of a document. Can be used instead of a - term element. For example: + A string with a * wildcard in it. This will be matched against the terms of a document. Can be used instead of a + term element. For example: - regex + + regex + - A regular expression which will be matched against the terms of a - document. Can be used instead of a term element. For + A regular expression which will be matched against the terms of a document. Can be used instead of a term element. For example: - bool + + bool + - Constructs a boolean query from its children. Each child element may - have an occurrence indicator, which could be either - must, should or - not: + Constructs a boolean query from its children. 
Each child element may have an occurrence indicator, which could be either + must, should or not: must - this part of the query must be - matched + this part of the query must be matched should - this part of the query should be - matched, but doesn't need to + this part of the query should be matched, but doesn't need to not - this part of the query must not be - matched + this part of the query must not be matched @@ -503,99 +650,80 @@ - phrase + + phrase + - Searches for a group of terms occurring in the correct order. The - element may either contain explicit term elements or text - content. Text will be automatically tokenized into a sequence of terms. - For example: + Searches for a group of terms occurring in the correct order. The element may either contain explicit term elements or + text content. Text will be automatically tokenized into a sequence of terms. For example: This has the same effect as: - The attribute slop can be used for a proximity - search: Lucene will try to find terms which are within the specified - distance: + The attribute slop can be used for a proximity search: Lucene will try to find terms which are within the + specified distance: - near + + near + - near is a powerful alternative to phrase and - one of the features not available through the standard Lucene query - parser. - If the element has text content only, it will be tokenized into terms - and the expression behaves like phrase. Otherwise it may - contain any combination of term, first and nested - near elements. This makes it possible to search for two - sequences of terms which are within a specific distance. For - example: + + near is a powerful alternative to phrase and one of the features not available through the standard Lucene + query parser. + If the element has text content only, it will be tokenized into terms and the expression behaves like phrase. Otherwise + it may contain any combination of term, first and nested near elements. 
This makes it possible to + search for two sequences of terms which are within a specific distance. For example: - Element first matches a span against the start of the text - in the context node. It takes an optional attribute - end to specify the maximum distance from the start of - the text. For example: + Element first matches a span against the start of the text in the context node. It takes an optional attribute + end to specify the maximum distance from the start of the text. For example: - As shown above, the content of first can again be text, a - term or near. - Contrary to phrase, near can be told to ignore - the order of its components. Use parameter - ordered="yes|no" to change near's behaviour. For - example: + As shown above, the content of first can again be text, a term or near. + Contrary to phrase, near can be told to ignore the order of its components. Use parameter + ordered="yes|no" to change near's behaviour. For example: - All elements in a query may have an optional boost parameter - (float). The score of the nodes matching the corresponding query part will be - multiplied by this factor. + All elements in a query may have an optional boost parameter (float). The score of the nodes matching the corresponding + query part will be multiplied by this factor. - - Additional parameters - - The ft:query function allows a third parameter for passing additional - settings to the query engine. This parameter must be an XML fragment which lists the - configuration properties to be set as child elements: - + The ft:query function allows a third parameter for passing additional settings to the query engine. This parameter must be an XML + fragment which lists the configuration properties to be set as child elements: - The meaning of those properties is as follows filter-rewrite - Controls how terms are expanded for wildcard or regular expression - searches. If set to yes, Lucene will use a filter to - pre-process matching terms. 
If set to no, all matching - terms will be added to a single boolean query which is then executed. - This may generate a "too many clauses" exception when applied to large - data sets. Setting filter-rewrite to yes avoids those - issues. + Controls how terms are expanded for wildcard or regular expression searches. If set to yes, Lucene will use a filter + to pre-process matching terms. If set to no, all matching terms will be added to a single boolean query which is then + executed. This may generate a "too many clauses" exception when applied to large data sets. Setting filter-rewrite to + yes avoids those issues. default-operator - The default operator with which multiple terms will be combined. - Allowed values: or, and. + The default operator with which multiple terms will be combined. Allowed values: or, and. phrase-slop - Sets the default slop for phrases. If 0, then exact - phrase matches are required. Default value is 0. + Sets the default slop for phrases. If 0, then exact phrase matches are required. Default value is + 0. leading-wildcard - When set to yes, * or ? are - allowed as the first character of a PrefixQuery and WildcardQuery. Note - that this can produce very slow queries on big indexes. + When set to yes, * or ? are allowed as the first character of a PrefixQuery and + WildcardQuery. Note that this can produce very slow queries on big indexes. @@ -612,6 +740,10 @@ extracted from for instance a PDF to the binary document. It works equally well for XML documents though and is an efficient method to attach computed fields to a document, containing information which does not exist in the XML as such. + + With the advent of ) functionality it is + recommended to use these instead of constructed fields. + The field indexes are not configured via collection.xconf. Instead we add fields programmatically from an XQuery (which could be run via a trigger): @@ -632,4 +764,4 @@ function: ft:remove-index("/db/demo/test.xml") - \ No newline at end of file +