ElasticSearch - different result ordering for simple request and aggregation request (NEST)
Asked Answered
H

1

7

I have a search page which contains two search result types: summary result and concrete result.

  • The summary result page contains the top 3 results per category (top hits).
  • The concrete result page contains all results for a selected category.

To obtain the Summary page I use the request:

var searchDescriptor = new SearchDescriptor<ElasticType>();
searchDescriptor.Index("index_name")
    .Query(q =>
      q.MultiMatch(m => m
        .Fields(fs => fs
          .Field(f => f.Content1, 3)
          .Field(f => f.Content2, 2)
          .Field(f => f.Content3, 1))
        .Fuzziness(Fuzziness.EditDistance(1))
        .Query(query)
        .Boost(1.1)
        .Slop(2)
        .PrefixLength(1)
        .MaxExpansions(100)
        .Operator(Operator.Or)
        .MinimumShouldMatch(2)
        .FuzzyRewrite(RewriteMultiTerm.ConstantScoreBoolean)
        .TieBreaker(1.0)
        .CutoffFrequency(0.5)
        .Lenient()
        .ZeroTermsQuery(ZeroTermsQuery.All))
    && (q.Terms(t => t.Field(f => f.LanguageId).Terms(1)) || q.Terms(t => t.Field(f => f.LanguageId).Terms(0))))
    .Aggregations(a => a
      .Terms("category", tagd => tagd
        .Field(f => f.Category)
        .Size(10)
        .Aggregations(aggs => aggs.TopHits("top_tag_hits", t => t.Size(3)))))
    .FielddataFields(fs => fs
      .Field(p => p.Content1, 3)
      .Field(p => p.Content2, 2)
      .Field(p => p.Content3, 1));

var elasticResult = _elasticClient.Search<ElasticType>(_ => searchDescriptor);

And I get a result like this, for example:

{
    "aggregations": {
        "category": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [{
                "key": "category1",
                "doc_count": 40,
                "top_tag_hits": {
                    "hits": {
                        "total": 40,
                        "max_score": 5.4,
                        "hits": [{
                            "_index": "...",
                            "_type": "...",
                            "_id": "...",
                            "_score": 5.4,
                            "_source": {
                                "id": 1
                            }
                        },
                        {
                            "_index": "...",
                            "_type": "...",
                            "_id": "...",
                            "_score": 4.3,
                            "_source": {
                                "id": 3 // FAIL!
                            }
                        },
                        {
                            "_index": "...",
                            "_type": "...",
                            "_id": "...",
                            "_score": 4.3,
                            "_source": {
                                "id": 2
                            }
                        }]
                    }
                }
            }]
        }
    }
}

So I get several hits with the same _score.

To obtain the concrete result (by category) page I use the request:

var searchDescriptor = new SearchDescriptor<ElasticType>();
searchDescriptor.Index("index_name")
    .Size(perPage <= 0 ? 100 : perPage)
    .From(page * perPage)
    .Query(q => q
      .MultiMatch(m => m
         .Fields(fs => fs
           .Field(f => f.Content1, 3)
           .Field(f => f.Content2, 2)
           .Field(f => f.Content3, 1)
           .Field(f => f.Category))
         .Fuzziness(Fuzziness.EditDistance(1))
         .Query(searchRequest.Query)
         .Boost(1.1)
         .Slop(2)
         .PrefixLength(1)
         .MaxExpansions(100)
         .Operator(Operator.Or)
         .MinimumShouldMatch(2)
         .FuzzyRewrite(RewriteMultiTerm.ConstantScoreBoolean)
         .TieBreaker(1.0)
         .CutoffFrequency(0.5)
         .Lenient()
         .ZeroTermsQuery(ZeroTermsQuery.All))
      && q.Term(t => t.Field(f => f.Category).Value(searchRequest.Category))
      && (q.Terms(t => t.Field(f => f.LanguageId).Terms(1)) || q.Terms(t => t.Field(f => f.LanguageId).Terms(0))))
    .FielddataFields(fs => fs
      .Field(p => p.Content1, 3)
      .Field(p => p.Content2, 2)
      .Field(p => p.Content3, 1))
    .Aggregations(a => a
      .Terms("category", tagd => tagd
        .Field(f => f.Category)));

And the result something like this:

{
    "hits": {
        "total": 40,
        "max_score": 7.816723,
        "hits": [{
            "_index": "...",
            "_type": "...",
            "_id": "...",
            "_score": 7.816723,
            "_source": {
                "id": 1
            }
        },
        {
            "_index": "...",
            "_type": "...",
            "_id": "...",
            "_score": 6.514713,
            "_source": {
                "id": 2
            }
        },
        {
            "_index": "...",
            "_type": "...",
            "_id": "...",
            "_score": 6.514709,
            "_source": {
                "id": 3
            }
        }]
    }
}

So in the second case, for a specific category, I get the _score with high precision and Elasticsearch can easily sort the results correctly. But in the aggregation case there are results with the same _score, and it is not clear how the sorting works in that case.

Can someone direct me to the right path how to solve this problem? or how can I achieve the same order in the results? Maybe I can increase the accuracy for the aggregated results?

I use elasticsearch server version "5.3.0" and NEST library version "5.0.0".

Update: Native query for aggregation request:

{
    "fielddata_fields": [
        "content1^3",
        "content2^2",
        "content3^1"
    ],
    "aggs": {
        "category": {
            "terms": {
                "field": "category",
                "size": 10
            },
            "aggs": {
                "top_tag_hits": {
                    "top_hits": {
                        "size": 3
                    }
                }
            }
        }
    },
    "query": {
        "bool": {
            "must": [
                {
                    "multi_match": {
                        "boost": 1.1,
                        "query": "sparta",
                        "fuzzy_rewrite": "constant_score_boolean",
                        "fuzziness": 1,
                        "cutoff_frequency": 0.5,
                        "prefix_length": 1,
                        "max_expansions": 100,
                        "slop": 2,
                        "lenient": true,
                        "tie_breaker": 1.0,
                        "minimum_should_match": 2,
                        "operator": "or",
                        "fields": [
                            "content1^3",
                            "content2^2",
                            "content3^1"
                        ],
                        "zero_terms_query": "all"
                    }
                },
                {
                    "bool": {
                        "should": [
                            {
                                "terms": {
                                    "languageId": [
                                        1
                                    ]
                                }
                            },
                            {
                                "terms": {
                                    "languageId": [
                                        0
                                    ]
                                }
                            }
                        ]
                    }
                }
            ]
        }
    }
}

Native query for concrete request:

{
    "from": 0,
    "size": 100,
    "fielddata_fields": [
        "content1^3",
        "content2^2",
        "content3^1"
    ],
    "aggs": {
        "category": {
            "terms": {
                "field": "category"
            }
        }
    },
    "query": {
        "bool": {
            "must": [
                {
                    "bool": {
                        "must": [
                            {
                                "multi_match": {
                                    "boost": 1.1,
                                    "query": ".....",
                                    "fuzzy_rewrite": "constant_score_boolean",
                                    "fuzziness": 1,
                                    "cutoff_frequency": 0.5,
                                    "prefix_length": 1,
                                    "max_expansions": 100,
                                    "slop": 2,
                                    "lenient": true,
                                    "tie_breaker": 1.0,
                                    "minimum_should_match": 2,
                                    "operator": "or",
                                    "fields": [
                                        "content1^3",
                                        "content2^2",
                                        "content3^1",
                                        "category"
                                    ],
                                    "zero_terms_query": "all"
                                }
                            },
                            {
                                "term": {
                                    "category": {
                                        "value": "category1"
                                    }
                                }
                            }
                        ]
                    }
                },
                {
                    "bool": {
                        "should": [
                            {
                                "terms": {
                                    "languageId": [
                                        1
                                    ]
                                }
                            },
                            {
                                "terms": {
                                    "languageId": [
                                        0
                                    ]
                                }
                            }
                        ]
                    }
                }
            ]
        }
    }
}

I also use the following mapping to create the index:

var descriptor = new CreateIndexDescriptor(indexName)
    .Mappings(ms => ms
     .Map<ElasticType>(m => m
       .Properties(ps => ps
         .Keyword(s => s.Name(ecp => ecp.Title))
         .Text(s => s.Name(ecp => ecp.Content1))
         .Text(s => s.Name(ecp => ecp.Content2))
         .Text(s => s.Name(ecp => ecp.Content3))
         .Date(s => s.Name(ecp => ecp.Date))
         .Number(s => s.Name(ecp => ecp.LanguageId).Type(NumberType.Integer))
         .Keyword(s => s.Name(ecp => ecp.Category))
         .Text(s => s.Name(ecp => ecp.PreviewImageUrl).Index(false))
         .Text(s => s.Name(ecp => ecp.OptionalContent).Index(false))
         .Text(s => s.Name(ecp => ecp.Url).Index(false)))));

    _elasticClient.CreateIndex(indexName, _ => descriptor);
Houseclean answered 2/6, 2017 at 11:37 Comment(8)
you could sort by date or something like that what important for the result by adding query .Sort()Wychelm
@eyildiz, But I don't need sorting on some field, I need to keep the default order by field _score.Houseclean
Sort by _score then by some other field e.g. idJustice
@RussCam, Yeah I thought about it - sort by _score and then by id/date. But in the case of a request for a specific category I get a different _score, and sort does not need anything - all right. But in the case of aggregation I receive the same _score for some results, the order can be the same as in the case of a request for a specific category, but maybe not. And it's not clear whether i need to sort at all.Houseclean
Could you add the JSON queries as well?Prognathous
Yes. In the second query you have an extra field .Field(f => f.ContentName))Prognathous
@FilipCordas, .Field(f => f.ContentName)) its my misprint, i updated this row too.Houseclean
@AntonKomyshan Two things I see that can influence the score one is the extra field in query "category" try putting it to 0, and the second is the { "term":{ "category": { "value": "category1" } } } try putting it in the filter context so it doesn't add to the score.Prognathous
I
3

Your query has problems.

  1. What you are using is a combination of must and should clauses nested inside a must clause of a bool query.

    So if you read more in this link, you can see for must

    The clause (query) must appear in matching documents and will contribute to the score.

    so it will give an equal score to all of your documents that match the condition. Any document that doesn't match the condition won't even be in the results to be scored.

    What you should do is use a should clause, but outside of the must clause, so Elasticsearch will be able to score your documents correctly.

  2. For more info as part of this question

    Can someone direct me to the right path how to solve this problem?

    you should pass 'explain': true in the query. You can read more about explain query and how to interpret results in this link.

  3. The answer to this question

    how can I achieve the same order in the results?

    is that, since every score is the same, Elasticsearch can return the results in whatever order it receives the responses from its nodes.

Possible Solution:

You should reorganize your query to make real use of should query and its boosting capabilities. You can read more about boosting here.

I tried two queries similar to yours, but with the correct usage of should, and they gave me the same order, as expected. Both of your queries should be constructed as below:

{
  "from": 0,
  "size": 10,
  "_source": [
    "content1^3",
    "content2^2",
    "content3^1"
  ],
  "query": {
    "bool": {
      "should": [
        {
          "match": {
            "languageId": 1
          }
        },
        {
          "match": {
            "languageId": 0
          }
        }
      ],
      "must": [
        {
          "multi_match": {
            "boost": 1.1,
            "query": ".....",
            "fuzzy_rewrite": "constant_score_boolean",
            "fuzziness": 1,
            "cutoff_frequency": 0.5,
            "prefix_length": 1,
            "max_expansions": 100,
            "slop": 2,
            "lenient": true,
            "tie_breaker": 1,
            "minimum_should_match": 2,
            "operator": "or",
            "fields": [
              "content1^3",
              "content2^2",
              "content3^1",
              "category"
            ],
            "zero_terms_query": "all"
          }
        }
      ]
    }
  }
}

and the second query as:

{
  "size": 0,
  "query": {
    "bool": {
      "should": [
        {
          "match": {
            "languageId": 1
          }
        },
        {
          "match": {
            "languageId": 0
          }
        }
      ],
      "must": [
        {
          "multi_match": {
            "boost": 1.1,
            "query": ".....",
            "fuzzy_rewrite": "constant_score_boolean",
            "fuzziness": 1,
            "cutoff_frequency": 0.5,
            "prefix_length": 1,
            "max_expansions": 100,
            "slop": 2,
            "lenient": true,
            "tie_breaker": 1,
            "minimum_should_match": 2,
            "operator": "or",
            "fields": [
              "content1^3",
              "content2^2",
              "content3^1",
              "category"
            ],
            "zero_terms_query": "all"
          }
        }
      ]
    }
  },
  "aggs": {
    "categories": {
      "terms": {
        "field": "category",
        "size": 10
      },
      "aggs": {
        "produdtcs": {
          "top_hits": {
            "_source": [
              "content1^3",
              "content2^2",
              "content3^1"
            ],
            "size": 3
          }
        }
      }
    }
  }
}
Intermolecular answered 9/6, 2017 at 8:48 Comment(0)

© 2022 - 2024 — McMap. All rights reserved.