Since you said "I'm more interested in runtime", here are some faster replacements for cursor += sum(1 for _ in group) in your groupby solution.
Using @Guy's data1, but with all segment lengths divided by 10:
              normal  optimized
original      870 ms     871 ms
zip_count     612 ms     611 ms
count_of      344 ms     345 ms
list_index    387 ms     386 ms
length_hint   223 ms     222 ms
Using list(range(1000000)) instead:
              normal  optimized
original      385 ms     331 ms
zip_count     463 ms     401 ms
count_of      197 ms     165 ms
list_index    175 ms     143 ms
length_hint   226 ms     127 ms
The optimized versions use local variables (avoiding repeated global and attribute lookups) or list comprehensions.
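For intuition, here is a minimal standalone sketch (use_global and use_local are illustrative names, not part of the benchmark below) of why the local alias helps: in CPython, reading a local variable is cheaper than looking up a global name on every loop iteration.

from timeit import timeit

def use_global(n):
    total = 0
    for _ in range(n):
        total += sum(())  # global name lookup on every iteration
    return total

def use_local(n):
    sum_ = sum  # look the global up once, then use the cheap local
    total = 0
    for _ in range(n):
        total += sum_(())
    return total

print(timeit(lambda: use_global(10**5), number=20))
print(timeit(lambda: use_local(10**5), number=20))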
I don't think __length_hint__ is guaranteed to be exact, not even for a list iterator, but it appears to be (it passes my correctness checks) and I don't see why it wouldn't.
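As a quick illustration of what the length_hint solution below relies on (CPython behavior, shown as a standalone snippet): a list iterator's __length_hint__ reports how many elements it has not yet yielded, so len(data) minus the hint is the number of elements groupby has consumed.

it = iter([3, 1, 4, 1, 5])
print(it.__length_hint__())  # 5: nothing consumed yet
next(it)
next(it)
print(it.__length_hint__())  # 3: two elements consumed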
The code (Try it online!, but you'll have to reduce something to not exceed the time limit):
from timeit import default_timer as timer
from itertools import groupby, count
from collections import deque
from operator import countOf
def original(data):
    cursor = 0
    result = []
    for key, group in groupby(data):
        cursor += sum(1 for _ in group)
        result.append(cursor)
    return result
def original_opti(data):
    cursor = 0
    sum_ = sum
    return [cursor := cursor + sum_(1 for _ in group)
            for _, group in groupby(data)]
def zip_count(data):
    cursor = 0
    result = []
    for key, group in groupby(data):
        index = count()
        deque(zip(group, index), 0)  # maxlen=0: drain the group at C speed
        cursor += next(index)  # counter advanced once per element, so this is the group length
        result.append(cursor)
    return result
def zip_count_opti(data):
    cursor = 0
    result = []
    append = result.append
    count_, deque_, zip_, next_ = count, deque, zip, next
    for key, group in groupby(data):
        index = count_()
        deque_(zip_(group, index), 0)
        cursor += next_(index)
        append(cursor)
    return result
def count_of(data):
    cursor = 0
    result = []
    for key, group in groupby(data):
        cursor += countOf(group, key)  # every element in the group equals key
        result.append(cursor)
    return result
def count_of_opti(data):
    cursor = 0
    countOf_ = countOf
    result = [cursor := cursor + countOf_(group, key)
              for key, group in groupby(data)]
    return result
def list_index(data):
    cursor = 0
    result = []
    for key, _ in groupby(data):
        cursor = data.index(key, cursor)  # start of this group = end of the previous one
        result.append(cursor)
    del result[0]  # the first group starts at 0, which is not an end index
    result.append(len(data))
    return result
def list_index_opti(data):
    cursor = 0
    index = data.index
    groups = groupby(data)
    next(groups, None)  # skip the first group instead of deleting result[0]
    result = [cursor := index(key, cursor)
              for key, _ in groups]
    result.append(len(data))
    return result
def length_hint(data):
    result = []
    it = iter(data)
    for _ in groupby(it):
        # groupby has consumed up to and including the first element of the
        # current group, so the previous group ended one element earlier.
        result.append(len(data) - (1 + it.__length_hint__()))
    del result[0]
    result.append(len(data))
    return result
def length_hint_opti(data):
    it = iter(data)
    groups = groupby(it)
    next(groups, None)
    n_minus_1 = len(data) - 1
    length_hint = it.__length_hint__
    result = [n_minus_1 - length_hint()
              for _ in groups]
    result.append(len(data))
    return result
funcss = [
    (original, original_opti),
    (zip_count, zip_count_opti),
    (count_of, count_of_opti),
    (list_index, list_index_opti),
    (length_hint, length_hint_opti),
]
data1 = [1] * 3 + [2] * 3 + [4] * 5 + [5] * 2 + [7] * 4 + [9] * 3 + [11] * 1 + [15] * 3
data1 = [x for x in data1 for _ in range(1000000)]
data2 = list(range(1000000))
for _ in range(3):
    for name in 'data1', 'data2':
        print(name)
        data = eval(name)
        expect = None
        for funcs in funcss:
            print(f'{funcs[0].__name__:11}', end='')
            for func in funcs:
                times = []
                for _ in range(5):
                    start_time = timer()
                    result = func(data)
                    end_time = timer()
                    times.append(end_time - start_time)
                print(f'{round(min(times) * 1e3):5d} ms', end='')
                if expect is None:
                    expect = result
                else:
                    assert result == expect
            print()
        print()
Choosing between O(n) and O(k*log n) depends on k, which you probably don't know upfront. If you're looking for the fastest algorithm in practice, though, I would expect that a simple for loop that checks elements against the previous one would be fastest, unless k is a lot smaller than n. But you would need to benchmark that to be sure. – Rinse

k will be a lot smaller than n. – Cowskin

bisect(data, data[cursor], cursor) also works and is nicer (less code and doesn't need the values to be integers). – Consternation
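A minimal sketch of the simple for loop suggested in the first comment (simple_loop is an illustrative name; like the solutions above it assumes non-empty data, and it is not part of the benchmark):

def simple_loop(data):
    # Compare each element against its predecessor; a change marks a group boundary.
    result = [i for i in range(1, len(data)) if data[i] != data[i - 1]]
    result.append(len(data))
    return result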
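And a sketch of the bisect approach from the last comment (bisect_jump is an illustrative name; it assumes data is sorted, which holds here since the runs appear in increasing order). Each call skips past the whole current run in O(log n):

from bisect import bisect

def bisect_jump(data):
    cursor = 0
    result = []
    n = len(data)
    while cursor < n:
        cursor = bisect(data, data[cursor], cursor)  # index just past the current run
        result.append(cursor)
    return result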