How can I sort in a round robin fashion in Lucene?
Asked Answered
C

1

0

I want to write a ComparatorSource for Lucene 3.6 with which I can accomplish a Round Robin style of sorting:

Let's say we have something like this:

A,A,A,B,C,D,D

And I want to have them sorted into this order (not neccessarily in alphabetical order)

A, B, C, D, A, D, A

How can I accomplish such a behaviour?

This is what I came up with, but it doesn't work because the timeshards arent updated in the correct place. Where can i do that instead?

public class RoundRobinFieldComparatorSource<T> extends FieldComparatorSource {

    private static long seed = new Random().nextInt();

    public static void setNewSeed(long seed) {
        RoundRobinFieldComparatorSource.seed = seed;
    }

    private static final long serialVersionUID = -8959374194451783596L;

    private final Set<T> roundRobinValues;
    private final boolean excluded;
    private final TwoWayStringBridge stringBridge;
    private final Set<String> preferredValues;

    public RoundRobinFieldComparatorSource(TwoWayStringBridge stringBridge,
            Set<T> roundRobinValues) {
        this(stringBridge, roundRobinValues, new HashSet<T>());
    }

    /**
     * same as
     * {@link #RoundRobinFieldComparatorSource(TwoWayStringBridge, Set, boolean)}
     * but with excluded set to false
     */
    public RoundRobinFieldComparatorSource(TwoWayStringBridge stringBridge,
            Set<T> roundRobinValues, Set<T> preferredValues) {
        this(stringBridge, roundRobinValues, preferredValues, false);
    }

    /**
     * @param excluded
     *            determines whether the passed roundRobinValues are to be
     *            excluded in scheduling
     */
    public RoundRobinFieldComparatorSource(TwoWayStringBridge stringBridge,
            Set<T> roundRobinValues, Set<T> preferredValues, boolean excluded) {
        this.stringBridge = stringBridge;
        this.roundRobinValues = roundRobinValues;
        this.excluded = excluded;
        this.preferredValues = new HashSet<>();
        if(preferredValues != null) {
            for(T val : preferredValues) {
                this.preferredValues.add(this.stringBridge.objectToString(val));
            }
        }
    }

    public static <T> Set<T> toSet(Collection<T> iterable) {
        Set<T> ret = new HashSet<>();
        ret.addAll(iterable);
        return ret;
    }

    @Override
    public FieldComparator<String> newComparator(final String fieldName,
            int numHits, int sortPos, final boolean reversed)
            throws IOException {
        return new FieldComparator<String>() {

            private String[] values;
            private String bottom;
            private String[] currentReaderValues;
            private Map<String, Integer> passedTimeShards;
            private Random random;

            @Override
            public int compare(int slot1, int slot2) {
                return this.compare(this.values[slot1], this.values[slot2]);
            }

            @Override
            public int compareBottom(int doc) throws IOException {
                return this.compare(this.bottom, this.currentReaderValues[doc]);
            }

            @Override
            public void copy(int slot, int doc) throws IOException {
                this.values[slot] = this.currentReaderValues[doc];
            }

            @Override
            public void setBottom(int slot) {
                this.bottom = this.values[slot];
            }

            @Override
            public void setNextReader(IndexReader reader, int docBase)
                    throws IOException {
                // TODO: maybe do a copy?
                this.currentReaderValues = FieldCache.DEFAULT.getStrings(
                        reader, fieldName);
                this.values = new String[reader.maxDoc()];
                this.passedTimeShards = new HashMap<>();
                this.random = new Random(seed);
                // initialize our timeshards
                // and make sure the values that have not been passed
                // and were returned have their timeshard equal to
                // Integer.MAX_VALUE so they won't be "scheduled"
                // for a spot in the result list
                for (String string : this.currentReaderValues) {
                    @SuppressWarnings("unchecked")
                    T object = (T) RoundRobinFieldComparatorSource.this.stringBridge
                            .stringToObject(string);
                    boolean scheduled = RoundRobinFieldComparatorSource.this.roundRobinValues
                            .contains(object);
                    if (RoundRobinFieldComparatorSource.this.excluded) {
                        scheduled = !scheduled;
                    }
                    int curTimeShard = 0;
                    if (!scheduled) {
                        curTimeShard = Integer.MAX_VALUE;
                    }
                    this.passedTimeShards.put(string, curTimeShard);
                }
            }

            @Override
            public String value(int slot) {
                return this.values[slot];
            }

            private int compare(String first, String second) {
                if (first == null && second == null) {
                    return 0;
                } else if (first == null) {
                    return 1;
                } else if (second == null) {
                    return -1;
                }
                Integer firstTimeShard = this.passedTimeShards.get(first);
                Integer secondTimeShard = this.passedTimeShards.get(second);
                int result = Integer.compare(firstTimeShard, secondTimeShard);
                if (result == 0) {
                    // now check for preferred values (these are the first ones
                    // to be "scheduled" in each shard)
                    if (RoundRobinFieldComparatorSource.this.preferredValues
                            .contains(first)
                            && RoundRobinFieldComparatorSource.this.preferredValues
                                    .contains(second)) {
                        // coin flip if equal
                        if (this.random.nextBoolean()) {
                            result = 1;
                        } else {
                            result = -1;
                        }
                    } else if (RoundRobinFieldComparatorSource.this.preferredValues
                            .contains(first)) {
                        result = -1;
                    } else if (RoundRobinFieldComparatorSource.this.preferredValues
                            .contains(second)) {
                        result = 1;
                    } else {
                        // coin flip if equal
                        if (this.random.nextBoolean()) {
                            result = 1;
                        } else {
                            result = -1;
                        }
                    }
                }
                if (reversed) {
                    result *= -1;
                }
                // and add one timeshard to the "winner" of this round
                if (result < 0) {
                    firstTimeShard += 1;
                    this.passedTimeShards.put(first, firstTimeShard);
                } else {
                    secondTimeShard += 1;
                    this.passedTimeShards.put(second, secondTimeShard);
                }
                return result;
            }

        };
    }
}

Thanks in advance.

Chloroplast answered 3/2, 2014 at 13:20 Comment(2)
One approach could be to compute the sorting in the setNextReader method (put the sorting position in the values array) and then basically do the sorting with these values.Chloroplast
I got it. I will post it as soon as I have verified that it works correctly.Chloroplast
C
0
package de.indv.rentino.mk3.hibernate.lucene.search.sort;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.FieldComparatorSource;
import org.hibernate.search.bridge.TwoWayStringBridge;

/**
 * Round Robin Sorting for Lucene
 * 
 * @author Martin Braun
 */
public class RoundRobinFieldComparatorSource<T> extends FieldComparatorSource {

    private static long seed = new Random().nextInt();

    public static void setNewSeed(long seed) {
        RoundRobinFieldComparatorSource.seed = seed;
    }

    private static final long serialVersionUID = -8959374194451783596L;

    private final Set<T> roundRobinValues;
    private final boolean excluded;
    private final TwoWayStringBridge stringBridge;
    private final Set<String> preferredValues;

    public RoundRobinFieldComparatorSource(TwoWayStringBridge stringBridge,
            Set<T> roundRobinValues) {
        this(stringBridge, roundRobinValues, new HashSet<T>());
    }

    /**
     * same as
     * {@link #RoundRobinFieldComparatorSource(TwoWayStringBridge, Set, boolean)}
     * but with excluded set to false
     */
    public RoundRobinFieldComparatorSource(TwoWayStringBridge stringBridge,
            Set<T> roundRobinValues, Set<T> preferredValues) {
        this(stringBridge, roundRobinValues, preferredValues, false);
    }

    /**
     * @param excluded
     *            determines whether the passed roundRobinValues are to be
     *            excluded in scheduling
     */
    public RoundRobinFieldComparatorSource(TwoWayStringBridge stringBridge,
            Set<T> roundRobinValues, Set<T> preferredValues, boolean excluded) {
        this.stringBridge = stringBridge;
        this.roundRobinValues = roundRobinValues;
        this.excluded = excluded;
        this.preferredValues = new HashSet<>();
        if (preferredValues != null) {
            for (T val : preferredValues) {
                this.preferredValues.add(this.stringBridge.objectToString(val));
            }
        }
    }

    public static <T> Set<T> toSet(Collection<T> iterable) {
        Set<T> ret = new HashSet<>();
        ret.addAll(iterable);
        return ret;
    }

    @Override
    public FieldComparator<Integer> newComparator(final String fieldName,
            int numHits, int sortPos, final boolean reversed)
            throws IOException {
        return new FieldComparator<Integer>() {

            private Integer[] values;
            private Integer[] slotsForDocs;
            private Integer bottom;

            @Override
            public void setNextReader(IndexReader reader, int docBase)
                    throws IOException {
                final Random random = new Random(
                        RoundRobinFieldComparatorSource.seed);
                // TODO: maybe do a copy?
                String[] strings = FieldCache.DEFAULT.getStrings(reader,
                        fieldName);
                this.values = new Integer[reader.maxDoc()];
                Map<String, List<Integer>> scheduledOccurencesPerString = new HashMap<>();
                Map<String, List<Integer>> unscheduledOccurencesPerString = new HashMap<>();
                List<String> order = new ArrayList<>();
                List<String> excludedOrder = new ArrayList<>();
                {
                    {
                        Set<String> alreadyAdded = new HashSet<>();
                        for (int i = 0; i < strings.length; ++i) {
                            String string = strings[i];
                            this.values[i] = Integer.MAX_VALUE;
                            if (string != null) {
                                @SuppressWarnings("unchecked")
                                T object = (T) RoundRobinFieldComparatorSource.this.stringBridge
                                        .stringToObject(string);
                                boolean scheduled = RoundRobinFieldComparatorSource.this.roundRobinValues
                                        .contains(object);
                                if (RoundRobinFieldComparatorSource.this.excluded) {
                                    scheduled = !scheduled;
                                }
                                if (scheduled) {
                                    if (!alreadyAdded.contains(string)) {
                                        alreadyAdded.add(string);
                                        order.add(string);
                                    }
                                    List<Integer> occurences = scheduledOccurencesPerString
                                            .get(string);
                                    if (occurences == null) {
                                        occurences = new ArrayList<>();
                                        scheduledOccurencesPerString.put(
                                                string, occurences);
                                    }
                                    occurences.add(i);
                                } else {
                                    if (!alreadyAdded.contains(string)) {
                                        alreadyAdded.add(string);
                                        excludedOrder.add(string);
                                    }
                                    List<Integer> occurences = unscheduledOccurencesPerString
                                            .get(string);
                                    if (occurences == null) {
                                        occurences = new ArrayList<>();
                                        unscheduledOccurencesPerString.put(
                                                string, occurences);
                                    }
                                    occurences.add(i);
                                }
                            }
                        }
                    }
                    sortByPreference(
                            order,
                            random,
                            RoundRobinFieldComparatorSource.this.preferredValues);
                    sortByPreference(excludedOrder, random,
                            new HashSet<String>());
                }
                int sortIndex = 0;
                // schedule the included ones
                sortIndex = schedule(sortIndex, order,
                        scheduledOccurencesPerString, values);
                // schedule the excluded ones
                sortIndex = schedule(sortIndex, excludedOrder,
                        unscheduledOccurencesPerString, values);
                // slotsForDocs equals the values before the actual sorting has
                // started
                // in the lucene methods so we can just copy the values array
                this.slotsForDocs = new Integer[values.length];
                System.arraycopy(this.values, 0, this.slotsForDocs, 0,
                        this.values.length);
            }

            @Override
            public int compare(int slot1, int slot2) {
                return Integer.compare(this.values[slot1], this.values[slot2]);
            }

            @Override
            public int compareBottom(int doc) throws IOException {
                return Integer.compare(this.bottom, this.slotsForDocs[doc]);
            }

            @Override
            public void setBottom(int slot) {
                this.bottom = this.values[slot];
            }

            @Override
            public void copy(int slot, int doc) throws IOException {
                this.values[slot] = this.slotsForDocs[doc];
            }

            @Override
            public Integer value(int slot) {
                return this.values[slot];
            }

        };
    }

    private static void sortByPreference(List<String> list,
            final Random random, final Set<String> preferredValues) {
        //and shuffle before we do anything dumb
        Collections.shuffle(list, random);
        Collections.sort(list, new Comparator<String>() {

            @Override
            public int compare(String first, String second) {
                int result;
                if (preferredValues.contains(first)
                        && preferredValues.contains(second)) {
                    result = 0;
                } else if (preferredValues.contains(first)) {
                    result = -1;
                } else if (preferredValues.contains(second)) {
                    result = 1;
                } else {
                    result = 0;
                }
                return result;
            }

        });
    }

    private static int schedule(int sortIndex, List<String> order,
            Map<String, List<Integer>> positions, Integer[] values) {
        while (positions.size() > 0) {
            // repeatedly iterate over the order
            // and put the index for the next element
            // with the same string if possible
            for (int i = 0; i < order.size(); ++i) {
                String cur = order.get(i);
                if (positions.get(cur) != null) {
                    // get an occurence of the string
                    Integer pos = positions.get(cur).remove(0);
                    // and put the sort index into the values array
                    values[pos] = sortIndex++;
                    if (positions.get(cur).size() == 0) {
                        positions.remove(cur);
                    }
                }
            }
        }
        return sortIndex;
    }

}

That's my solution.

Chloroplast answered 6/2, 2014 at 13:28 Comment(0)

© 2022 - 2024 — McMap. All rights reserved.