Loop though ElasticSearch documents source array in painless
Asked Answered
C

1

3

I have the following ElasticSearch data structure for products in a webshop:

{
  "_index": "vue_storefront_catalog_1_product_1617378559",
  "_type": "_doc",
  "_source": {
    "configurable_children": [
      {
        "price": 49.99,
        "special_price": 34.99,
        "special_from_date": "2020-11-27 00:00:00",
        "special_to_date": "2020-11-30 23:59:59",
        "stock": {
          "qty": 0,
          "is_in_stock": false,
          "stock_status": 0
        }
      }
      {
        "price": 49.99,
        "special_price": null,
        "special_from_date": null,
        "special_to_date": null,
        "stock": {
          "qty": 0,
          "is_in_stock": false,
          "stock_status": 0
        }
      }
    ]
}

Using the following mapping:

{
  "vue_storefront_catalog_1_product_1614928276" : {
    "mappings" : {
      "properties" : {
        "configurable_children" : {
          "properties" : {
            "price" : {
              "type" : "double"
            },
            "special_from_date" : {
              "type" : "date",
              "format" : "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
            },
            "special_price" : {
              "type" : "double"
            },
            "special_to_date" : {
              "type" : "date",
              "format" : "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
            },
          }
        }
      }
    }
  }
}

I have created a Elasticsearch query to filter out only products that are in sale, that means: the special_price must be lower than the price and the current date must be between the special_from_date and special_to_date.

This is the Painless script I have created:

  boolean hasSale = false;

  long timestampNow = new Date().getTime();
  if (doc.containsKey('configurable_children.special_from_date') && !doc['configurable_children.special_from_date'].empty) {
    long timestampSpecialFromDate = doc['configurable_children.special_from_date'].value.toInstant().toEpochMilli();
    if (timestampSpecialFromDate > timestampNow) {
      hasSale = false;
    }
  } else if (doc.containsKey('configurable_children.special_to_date') && !doc['configurable_children.special_to_date'].empty) {
    long timestampSpecialToDate = doc['configurable_children.special_to_date'].value.toInstant().toEpochMilli();
    if (timestampSpecialToDate < timestampNow) {
      hasSale = false;
    }
  } else if (doc.containsKey('configurable_children.stock.is_in_stock') && doc['configurable_children.stock.is_in_stock'].value == false) {
      hasSale = false;
  } else if (1 - (doc['configurable_children.special_price'].value / doc['configurable_children.price'].value) > params.fraction) {
    hasSale = true;
  }

  return hasSale

This returns the product as soon as one of the configurable_children has met the criteria to be a sale product. This is incorrect, because I need to loop through the whole set op configurable_children to determine if it's a sale product. How can I make sure all children are taken into the calculation? With a loop?


Here is the new query as suggested by Joe in the answers:

GET vue_storefront_catalog_1_product/_search
{
  "query": {
    "function_score": {
      "query": {
        "match_all": {}
      },
      "functions": [
        {
          "script_score": {
            "script": {
              "source": """
                int allEntriesAreTrue(def arrayList) {
                  return arrayList.stream().allMatch(Boolean::valueOf) == true ? 1 : 0
                } 
                
                ArrayList childrenAreMatching = [];
                
                long timestampNow = params.timestampNow;
                
                ArrayList children = params._source['configurable_children'];
                
                if (children == null || children.size() == 0) {
                  return allEntriesAreTrue(childrenAreMatching);
                }
                
                for (config in children) {
                  if (!config.containsKey('stock')) {
                    childrenAreMatching.add(false);
                    continue;
                  } else if (!config['stock']['is_in_stock']
                      || config['special_price'] == null
                      || config['special_from_date'] == null 
                      || config['special_to_date'] == null) {
                    childrenAreMatching.add(false);
                    continue;
                  } 
                  
                  if (config['special_from_date'] != null && config['special_to_date'] != null) {
                    SimpleDateFormat sf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                    def from_millis = sf.parse(config['special_from_date']).getTime();
                    def to_millis = sf.parse(config['special_to_date']).getTime();
                    
                    if (!(timestampNow >= from_millis && timestampNow <= to_millis)) {
                      childrenAreMatching.add(false);
                      continue;
                    }
                  }
                  
                  def sale_fraction = 1 - (config['special_price'] / config['price']);
                  if (sale_fraction <= params.fraction) {
                    childrenAreMatching.add(false);
                    continue;
                  }
                  
                  childrenAreMatching.add(true);
                }
                return allEntriesAreTrue(childrenAreMatching);
              """,
              "params": {
                "timestampNow": 1617393889567,
                "fraction": 0.1
              }
            }
          }
        }
      ],
      "min_score": 1
    }
  }
}

The response is as follows:

{
  "took" : 15155,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 2936,
      "relation" : "eq"
    },
    "max_score" : 1.0,
    "hits" : [... hits here ...]
  }
}

Any idea why the query takes around 15 seconds?

Cletacleti answered 2/4, 2021 at 19:8 Comment(2)
Is configurable_children mapped as a nested field type?Selfpropelled
@joesorocin is is mapped as an object with properties underneath. I have added it to the post. It would be great if you have a lead!Cletacleti
S
4

Your intuition is right — you'll need to use a for loop if you want to check all of the array list objects.

Now, before I jump onto the iteration aspect, there's one important thing to know about arrays in Elasticsearch. When they're not defined as nested, their content will be flattened and the relationships between the individual key/value pairs will be lost. As such, you should definitely adjust your mapping like so:

{
  "vue_storefront_catalog_1_product_1614928276" : {
    "mappings" : {
      "properties" : {
        "configurable_children" : {
          "type": "nested",        <---
          "properties" : {
            "price" : {
              "type" : "double"
            },
            ...
          }
        }
      }
    }
  }
}

and reindex your data to ensure that the configurable_children are treated as separate, standalone entities.

As soon as they're mapped as nested, you'll be able to retrieve just those children that do match your scripted condition:

POST vue_storefront_catalog_1_product_1614928276/_search
{
  "_source": "configurable_children_that_match", 
  "query": {
    "nested": {
      "path": "configurable_children",
      "inner_hits": {
        "name": "configurable_children_that_match"
      }, 
      "query": {
        "bool": {
          "must": [
            {
              "script": {
                "script": {
                  "source": """
                    boolean hasSale = false;
                    
                    long timestampNow = new Date().getTime();
                    
                    if (doc.containsKey('configurable_children.special_from_date') && !doc['configurable_children.special_from_date'].empty) {
                      long timestampSpecialFromDate = doc['configurable_children.special_from_date'].value.toInstant().toEpochMilli();
                      if (timestampSpecialFromDate > timestampNow) {
                       return false
                      }
                    } 
                    
                    if (doc.containsKey('configurable_children.special_to_date') && !doc['configurable_children.special_to_date'].empty) {
                      long timestampSpecialToDate = doc['configurable_children.special_to_date'].value.toInstant().toEpochMilli();
                      if (timestampSpecialToDate < timestampNow) {
                        return false
                      }
                    }
                    
                    if (doc.containsKey('configurable_children.stock.is_in_stock') && doc['configurable_children.stock.is_in_stock'].value == false) {
                        return false
                    }
                    
                    if (1 - (doc['configurable_children.special_price'].value / doc['configurable_children.price'].value) > params.fraction) {
                      hasSale = true;
                    }
                    
                    return hasSale
                  """,
                  "params": {
                    "fraction": 0.1
                  }
                }
              }
            }
          ]
        }
      }
    }
  }
}

Two things to note here:

  1. The inner_hits attribute of a nested query allows you to let Elasticsearch know that you're only interested in those children that truly matched. Otherwise, all configurable_children would be returned. When specified in the _source parameter, the original, full JSON document source would be skipped and only the named inner_hits would be returned.
  2. Due to the distributed nature of ES, it's not recommended to use java's new Date(). I've explained the reasoning behind it my answer to How to get current time as unix timestamp for script use. You'll see me use a parametrized now in the query at the bottom of this answer.

Moving on, it's important to mention that nested objects are internally represented internally as separate subdocuments.

A side effect of this fact is that once you're inside a nested query's context, you don't have access to other nested children of the very same document.

In order to mitigate this, it's customary to regularly keep the nested children in sync such that when you do flatten one of the objects' attributes for use on the top-level, you can use a simply iterate the respective doc values. This flattening is usually done through the copy_to feature which I illustrated in my answer to How to iterate through a nested array in elasticsearch with filter script?

In your particular use case, this'd mean that you'd, for instance, use copy_to on the field stock.is_in_stock which'd result in a top-level boolean array list which is easier to work with than an array list of objects.

So far so good but you'd still be missing a way to filter based on the special_dates.

Now, regardless of whether you're dealing with nested or regular object field types, accessing params._source in regular script queries doesn't work in ES since v6.4.

However, there's still one type of query that does support iterating the _source — enter function_score queries.

As stated in your question, you

..need to loop through the whole set of configurable_children to determine if it's a sale product..

With that being said, here's how my query below works:

  1. The function_score query typically generates a custom calculated score, but it can, with the help of min_score, be used as a boolean yes/no filter to exclude docs whose configurable_children do not fulfil a certain condition.
  2. As the configurable_children are being iterated, each loop appends a boolean to childrenAreMatching which is then passed onto the allEntriesAreTrue helper which returns a 1 if they are, and a 0 if not.
  3. The dates are parsed and compared with the parametrized now; the fraction is compared too. If, at any point, some condition fails, the loop jumps to the next iteration.
POST vue_storefront_catalog_1_product_1614928276/_search
{
  "query": {
    "function_score": {
      "query": {
        "match_all": {}
      },
      "functions": [
        {
          "script_score": {
            "script": {
              "source": """
                // casting helper
                int allEntriesAreTrue(def arrayList) {
                  return arrayList.stream().allMatch(Boolean::valueOf) == true ? 1 : 0
                } 
                
                ArrayList childrenAreMatching = [];
                
                long timestampNow = params.timestampNow;
                
                ArrayList children = params._source['configurable_children'];
                
                if (children == null || children.size() == 0) {
                  return allEntriesAreTrue(childrenAreMatching);
                }
                
                for (config in children) {
                  if (!config['stock']['is_in_stock']
                      || config['special_price'] == null
                      || config['special_from_date'] == null 
                      || config['special_to_date'] == null) {
                    // nothing to do here...
                    childrenAreMatching.add(false);
                    continue;
                  } 
                  
                  SimpleDateFormat sf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                  def from_millis = sf.parse(config['special_from_date']).getTime();
                  def to_millis = sf.parse(config['special_to_date']).getTime();
                  
                  if (!(timestampNow >= from_millis && timestampNow <= to_millis)) {
                    // not in date range
                    childrenAreMatching.add(false);
                    continue;
                  }
                  
                  def sale_fraction = 1 - (config['special_price'] / config['price']);
                  if (sale_fraction <= params.fraction) {
                    // fraction condition not met
                    childrenAreMatching.add(false);
                    continue;
                  }
                  
                  childrenAreMatching.add(true);
                }
                
                // need to return a number because it's a script score query
                return allEntriesAreTrue(childrenAreMatching);
              """,
              "params": {
                "timestampNow": 1617393889567,
                "fraction": 0.1
              }
            }
          }
        }
      ],
      "min_score": 1
    }
  }
}

All in all, only those documents, whose all configurable_children fulfil the specified conditions, would be returned.


P.S. If you learned something from this answer and want to learn more, I dedicated a whole chapter to ES scripts in my Elasticsearch Handbook.

Selfpropelled answered 5/4, 2021 at 17:39 Comment(2)
Thanks a lot for the extensive answer Joe! It makes it so much more clear. I think I miss the fundamentals of Elasticsearch and will be looking into your handbook! Looks very nice and practical! I have modified the query a bit and posted it in the question above. The only downside now is that the complete query takes around 15/20 seconds! Is there a possibility to do something about this do you think?Cletacleti
You're welcome! Take your time with the book when you purchase it -- there's a bunch of interesting things in there :) Regarding the query speed -- scripts are typically the last resort. If there's any way of remodelling your data such that it's, for instance, child-based and the seldom-changed product attributes are duplicated for all children, go for it! The suggested script is essentially doing something that should really be done outside of ES. Hope it makes sense.Selfpropelled

© 2022 - 2024 — McMap. All rights reserved.