Because my old work often handles 1 million lines of fixwidth data, I did research on this issue when I started using Python.
There are 2 types of FixedWidth
- ASCII FixedWidth (ascii character length = 1, double-byte encoded character length = 2)
- Unicode FixedWidth (ascii character & double-byte encoded character length = 1)
If the resource string is all composed of ascii characters, then ASCII FixedWidth = Unicode FixedWidth
Fortunately, string and byte are different in py3, which reduces a lot of confusion when dealing with double-byte encoded characters (e.g.gbk, big5, euc-jp, shift-jis, etc.).
For the processing of "ASCII FixedWidth", the String is usually converted to Bytes and then split.
Without importing third-party modules
totalLineCount = 1 million, lineLength = 800 byte , FixedWidthArgs=(10,25,4,....), I split the Line in about 5 ways and get the following conclusion:
- struct is the fastest (1x)
- Loop only, not pre-processing FixedWidthArgs is the slowest (5x+)
slice(bytes)
is faster than slice(string)
- The source string is the bytes test result: struct(1x) , operator.itemgetter(1.7x) , precompiled sliceObject & list comprehensions(2.8x), re.patten object (2.9x)
When dealing with large files, we often use with open ( file, "rb") as f:
.
The method traverses one of the above files, about 2.4 second.
I think the appropriate handler, which processes 1 million rows of data, splits each row into 20 fields and takes less than 2.4 seconds.
I only find that stuct
and itemgetter
meet the requirements
ps: For normal display, I converted unicode str to bytes.
If you are in a double-byte environment, you don't need to do this.
from itertools import accumulate
from operator import itemgetter
def oprt_parser(sArgs):
sum_arg = tuple(accumulate(abs(i) for i in sArgs))
# Negative parameter field index
cuts = tuple(i for i,num in enumerate(sArgs) if num < 0)
# Get slice args and Ignore fields of negative length
ig_Args = tuple(item for i, item in enumerate(zip((0,)+sum_arg,sum_arg)) if i not in cuts)
# Generate `operator.itemgetter` object
oprtObj =itemgetter(*[slice(s,e) for s,e in ig_Args])
return oprtObj
lineb = b'abcdefghijklmnopqrstuvwxyz\xb0\xa1\xb2\xbb\xb4\xd3\xb5\xc4\xb6\xee\xb7\xa2\xb8\xf6\xba\xcd0123456789'
line = lineb.decode("GBK")
# Unicode Fixed Width
fieldwidthsU = (13, -13, 4, -4, 5,-5) # Negative width fields is ignored
# ASCII Fixed Width
fieldwidths = (13, -13, 8, -8, 5,-5) # Negative width fields is ignored
# Unicode FixedWidth processing
parse = oprt_parser(fieldwidthsU)
fields = parse(line)
print('Unicode FixedWidth','fields: {}'.format(tuple(map(lambda s: s.encode("GBK"), fields))))
# ASCII FixedWidth processing
parse = oprt_parser(fieldwidths)
fields = parse(lineb)
print('ASCII FixedWidth','fields: {}'.format(fields))
line = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\n'
fieldwidths = (2, -10, 24)
parse = oprt_parser(fieldwidths)
fields = parse(line)
print(f"fields: {fields}")
Output:
Unicode FixedWidth fields: (b'abcdefghijklm', b'\xb0\xa1\xb2\xbb\xb4\xd3\xb5\xc4', b'01234')
ASCII FixedWidth fields: (b'abcdefghijklm', b'\xb0\xa1\xb2\xbb\xb4\xd3\xb5\xc4', b'01234')
fields: ('AB', 'MNOPQRSTUVWXYZ0123456789')
oprt_parser
is 4x make_parser
(list comprehensions + slice)
During the research, it was found that when the cpu speed is faster, it seems that the efficiency of the re
method increases faster.
Since I don't have more and better computers to test, provide my test code, if anyone is interested, you can test it with a faster computer.
Run Environment:
- os:win10
- python: 3.7.2
- CPU:amd athlon x3 450
- HD:seagate 1T
import timeit
import time
import re
from itertools import accumulate
from operator import itemgetter
def eff2(stmt,onlyNum= False,showResult=False):
'''test function'''
if onlyNum:
rl = timeit.repeat(stmt=stmt,repeat=roundI,number=timesI,globals=globals())
avg = sum(rl) / len(rl)
return f"{avg * (10 ** 6)/timesI:0.4f}"
else:
rl = timeit.repeat(stmt=stmt,repeat=10,number=1000,globals=globals())
avg = sum(rl) / len(rl)
print(f"【{stmt}】")
print(f"\tquick avg = {avg * (10 ** 6)/1000:0.4f} s/million")
if showResult:
print(f"\t Result = {eval(stmt)}\n\t timelist = {rl}\n")
else:
print("")
def upDouble(argList,argRate):
return [c*argRate for c in argList]
tbStr = "000000001111000002222真2233333333000000004444444QAZ55555555000000006666666ABC这些事中文字abcdefghijk"
tbBytes = tbStr.encode("GBK")
a20 = (4,4,2,2,2,3,2,2, 2 ,2,8,8,7,3,8,8,7,3, 12 ,11)
a20U = (4,4,2,2,2,3,2,2, 1 ,2,8,8,7,3,8,8,7,3, 6 ,11)
Slng = 800
rateS = Slng // 100
tStr = "".join(upDouble(tbStr , rateS))
tBytes = tStr.encode("GBK")
spltArgs = upDouble( a20 , rateS)
spltArgsU = upDouble( a20U , rateS)
testList = []
timesI = 100000
roundI = 5
print(f"test round = {roundI} timesI = {timesI} sourceLng = {len(tStr)} argFieldCount = {len(spltArgs)}")
print(f"pure str \n{''.ljust(60,'-')}")
# ==========================================
def str_parser(sArgs):
def prsr(oStr):
r = []
r_ap = r.append
stt=0
for lng in sArgs:
end = stt + lng
r_ap(oStr[stt:end])
stt = end
return tuple(r)
return prsr
Str_P = str_parser(spltArgsU)
# eff2("Str_P(tStr)")
testList.append("Str_P(tStr)")
print(f"pure bytes \n{''.ljust(60,'-')}")
# ==========================================
def byte_parser(sArgs):
def prsr(oBytes):
r, stt = [], 0
r_ap = r.append
for lng in sArgs:
end = stt + lng
r_ap(oBytes[stt:end])
stt = end
return r
return prsr
Byte_P = byte_parser(spltArgs)
# eff2("Byte_P(tBytes)")
testList.append("Byte_P(tBytes)")
# re,bytes
print(f"re compile object \n{''.ljust(60,'-')}")
# ==========================================
def rebc_parser(sArgs,otype="b"):
re_Args = "".join([f"(.{{{n}}})" for n in sArgs])
if otype == "b":
rebc_Args = re.compile(re_Args.encode("GBK"))
else:
rebc_Args = re.compile(re_Args)
def prsr(oBS):
return rebc_Args.match(oBS).groups()
return prsr
Rebc_P = rebc_parser(spltArgs)
# eff2("Rebc_P(tBytes)")
testList.append("Rebc_P(tBytes)")
Rebc_Ps = rebc_parser(spltArgsU,"s")
# eff2("Rebc_Ps(tStr)")
testList.append("Rebc_Ps(tStr)")
print(f"struct \n{''.ljust(60,'-')}")
# ==========================================
import struct
def struct_parser(sArgs):
struct_Args = " ".join(map(lambda x: str(x) + "s", sArgs))
def prsr(oBytes):
return struct.unpack(struct_Args, oBytes)
return prsr
Struct_P = struct_parser(spltArgs)
# eff2("Struct_P(tBytes)")
testList.append("Struct_P(tBytes)")
print(f"List Comprehensions + slice \n{''.ljust(60,'-')}")
# ==========================================
import itertools
def slice_parser(sArgs):
tl = tuple(itertools.accumulate(sArgs))
slice_Args = tuple(zip((0,)+tl,tl))
def prsr(oBytes):
return [oBytes[s:e] for s, e in slice_Args]
return prsr
Slice_P = slice_parser(spltArgs)
# eff2("Slice_P(tBytes)")
testList.append("Slice_P(tBytes)")
def sliceObj_parser(sArgs):
tl = tuple(itertools.accumulate(sArgs))
tl2 = tuple(zip((0,)+tl,tl))
sliceObj_Args = tuple(slice(s,e) for s,e in tl2)
def prsr(oBytes):
return [oBytes[so] for so in sliceObj_Args]
return prsr
SliceObj_P = sliceObj_parser(spltArgs)
# eff2("SliceObj_P(tBytes)")
testList.append("SliceObj_P(tBytes)")
SliceObj_Ps = sliceObj_parser(spltArgsU)
# eff2("SliceObj_Ps(tStr)")
testList.append("SliceObj_Ps(tStr)")
print(f"operator.itemgetter + slice object \n{''.ljust(60,'-')}")
# ==========================================
def oprt_parser(sArgs):
sum_arg = tuple(accumulate(abs(i) for i in sArgs))
cuts = tuple(i for i,num in enumerate(sArgs) if num < 0)
ig_Args = tuple(item for i,item in enumerate(zip((0,)+sum_arg,sum_arg)) if i not in cuts)
oprtObj =itemgetter(*[slice(s,e) for s,e in ig_Args])
return oprtObj
Oprt_P = oprt_parser(spltArgs)
# eff2("Oprt_P(tBytes)")
testList.append("Oprt_P(tBytes)")
Oprt_Ps = oprt_parser(spltArgsU)
# eff2("Oprt_Ps(tStr)")
testList.append("Oprt_Ps(tStr)")
print("|".join([s.split("(")[0].center(11," ") for s in testList]))
print("|".join(["".center(11,"-") for s in testList]))
print("|".join([eff2(s,True).rjust(11," ") for s in testList]))
Output:
Test round = 5 timesI = 100000 sourceLng = 744 argFieldCount = 20
...
...
Str_P | Byte_P | Rebc_P | Rebc_Ps | Struct_P | Slice_P | SliceObj_P|SliceObj_Ps| Oprt_P | Oprt_Ps
-----------|-----------|-----------|-----------|-- ---------|-----------|-----------|-----------|---- -------|-----------
9.6315| 7.5952| 4.4187| 5.6867| 1.5123| 5.2915| 4.2673| 5.7121| 2.4713| 3.9051