I have a big text file (5Mb) that I use in my Android application. I create the file as a list of pre-sorted Strings, and the file doesn't change once it is created. How can I perform a binary search on the contents of this file, without reading line-by-line to find the matching String?
Since the content of the file does not change, you can break the file into multiple pieces. Say A-G, H-N, 0-T and U-Z. This allows you to check the first character and immediately be able to cut the possible set to a fourth of the original size. Now a linear search will not take as long or reading the whole file could be an option. This process could be extended if n/4 is still too large, but the idea is the same. Build the search breakdowns into the file structure instead of trying to do it all in memory.
A 5MB file isn't that big - you should be able to read each line into a String[]
array, which you can then use java.util.Arrays.binarySearch()
to find the line you want. This is my recommended approach.
If you don't want to read the whole file in to your app, then it gets more complicated. If each line of the file is the same length, and the file is already sorted, then you can open the file in RandomAccessFile and perform a binary search yourself by using seek()
like this...
// open the file for reading
RandomAccessFile raf = new RandomAccessFile("myfile.txt","r");
String searchValue = "myline";
int lineSize = 50;
int numberOfLines = raf.length() / lineSize;
// perform the binary search...
byte[] lineBuffer = new byte[lineSize];
int bottom = 0;
int top = numberOfLines;
int middle;
while (bottom <= top){
middle = (bottom+top)/2;
raf.seek(middle*lineSize); // jump to this line in the file
raf.read(lineBuffer); // read the line from the file
String line = new String(lineBuffer); // convert the line to a String
int comparison = line.compareTo(searchValue);
if (comparison == 0){
// found it
break;
}
else if (comparison < 0){
// line comes before searchValue
bottom = middle + 1;
}
else {
// line comes after searchValue
top = middle - 1;
}
}
raf.close(); // close the file when you're finished
However, if the file doesn't have fixed-width lines, then you can't easily perform a binary search without loading it into memory first, as you can't quickly jump to a specific line in the file like you can with fixed-width lines.
In a uniform character length text file you could seek to the middle of the interval in question character wise, start reading characters until you hit your deliminator, then use the subsequent string as an approximation for the element wise middle. The problem with doing this in android, though, is you apparently can't get random access to a resource (although I suppose you could just reopen it every time). Furthermore this technique doesn't generalize to maps and sets of other types.
Another option would be to (using a RandomAccessFile) write an "array" of ints - one for each String - at the beginning of the file then go back and update them with the locations of their corresponding Strings. Again the search will require jumping around.
What I would do (and did do in my own app) is implement a hash set in a file. This one does separate chaining with trees.
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.Set;
class StringFileSet {
private static final double loadFactor = 0.75;
public static void makeFile(String fileName, String comment, Set<String> set) throws IOException {
new File(fileName).delete();
RandomAccessFile fout = new RandomAccessFile(fileName, "rw");
//Write comment
fout.writeUTF(comment);
//Make bucket array
int numBuckets = (int)(set.size()/loadFactor);
ArrayList<ArrayList<String>> bucketArray = new ArrayList<ArrayList<String>>(numBuckets);
for (int ii = 0; ii < numBuckets; ii++){
bucketArray.add(new ArrayList<String>());
}
for (String key : set){
bucketArray.get(Math.abs(key.hashCode()%numBuckets)).add(key);
}
//Sort key lists in preparation for creating trees
for (ArrayList<String> keyList : bucketArray){
Collections.sort(keyList);
}
//Make queues in preparation for creating trees
class NodeInfo{
public final int lower;
public final int upper;
public final long callingOffset;
public NodeInfo(int lower, int upper, long callingOffset){
this.lower = lower;
this.upper = upper;
this.callingOffset = callingOffset;
}
}
ArrayList<LinkedList<NodeInfo>> queueList = new ArrayList<LinkedList<NodeInfo>>(numBuckets);
for (int ii = 0; ii < numBuckets; ii++){
queueList.add(new LinkedList<NodeInfo>());
}
//Write bucket array
fout.writeInt(numBuckets);
for (int index = 0; index < numBuckets; index++){
queueList.get(index).add(new NodeInfo(0, bucketArray.get(index).size()-1, fout.getFilePointer()));
fout.writeInt(-1);
}
//Write trees
for (int bucketIndex = 0; bucketIndex < numBuckets; bucketIndex++){
while (queueList.get(bucketIndex).size() != 0){
NodeInfo nodeInfo = queueList.get(bucketIndex).poll();
if (nodeInfo.lower <= nodeInfo.upper){
//Set respective pointer in parent node
fout.seek(nodeInfo.callingOffset);
fout.writeInt((int)(fout.length() - (nodeInfo.callingOffset + 4))); //Distance instead of absolute position so that the get method can use a DataInputStream
fout.seek(fout.length());
int middle = (nodeInfo.lower + nodeInfo.upper)/2;
//Key
fout.writeUTF(bucketArray.get(bucketIndex).get(middle));
//Left child
queueList.get(bucketIndex).add(new NodeInfo(nodeInfo.lower, middle-1, fout.getFilePointer()));
fout.writeInt(-1);
//Right child
queueList.get(bucketIndex).add(new NodeInfo(middle+1, nodeInfo.upper, fout.getFilePointer()));
fout.writeInt(-1);
}
}
}
fout.close();
}
private final String fileName;
private final int numBuckets;
private final int bucketArrayOffset;
public StringFileSet(String fileName) throws IOException {
this.fileName = fileName;
DataInputStream fin = new DataInputStream(new BufferedInputStream(new FileInputStream(fileName)));
short numBytes = fin.readShort();
fin.skipBytes(numBytes);
this.numBuckets = fin.readInt();
this.bucketArrayOffset = numBytes + 6;
fin.close();
}
public boolean contains(String key) throws IOException {
boolean containsKey = false;
DataInputStream fin = new DataInputStream(new BufferedInputStream(new FileInputStream(this.fileName)));
fin.skipBytes(4*(Math.abs(key.hashCode()%this.numBuckets)) + this.bucketArrayOffset);
int distance = fin.readInt();
while (distance != -1){
fin.skipBytes(distance);
String candidate = fin.readUTF();
if (key.compareTo(candidate) < 0){
distance = fin.readInt();
}else if (key.compareTo(candidate) > 0){
fin.skipBytes(4);
distance = fin.readInt();
}else{
fin.skipBytes(8);
containsKey = true;
break;
}
}
fin.close();
return containsKey;
}
}
A test program
import java.io.File;
import java.io.IOException;
import java.util.HashSet;
class Test {
public static void main(String[] args) throws IOException {
HashSet<String> stringMemorySet = new HashSet<String>();
stringMemorySet.add("red");
stringMemorySet.add("yellow");
stringMemorySet.add("blue");
StringFileSet.makeFile("stringSet", "Provided under ... included in all copies and derivatives ...", stringMemorySet);
StringFileSet stringFileSet = new StringFileSet("stringSet");
System.out.println("orange -> " + stringFileSet.contains("orange"));
System.out.println("red -> " + stringFileSet.contains("red"));
System.out.println("yellow -> " + stringFileSet.contains("yellow"));
System.out.println("blue -> " + stringFileSet.contains("blue"));
new File("stringSet").delete();
System.out.println();
}
}
You'll also need to pass a Context to it, if and when you modify it for android, so it can access the getResources() method.
You're also probably going to want to stop the android build tools from compressing the file, which can apparently only be done - if you're working with the GUI - by changing the file's extension to something such as jpg. This made the process about 100 to 300 times faster in my app.
You might also look into giving yourself more memory by using the NDK.
Here's something I quickly put together. It uses two files, one with the words, the other with the offsets. The format of the offset file is this: the first 10 bits contains the word size, the last 22 bits contains the offset (the word position, for example, aaah would be 0, abasementable would be 4, etc.). It's encoded in big endian (java standard). Hope it helps somebody.
word.dat:
aaahabasementableabnormalabnormalityabortionistabortion-rightsabracadabra
wordx.dat:
00 80 00 00 01 20 00 04 00 80 00 0D 01 00 00 11 _____ __________
01 60 00 19 01 60 00 24 01 E0 00 2F 01 60 00 3E _`___`_$___/_`_>
I created these files in C#, but here's the code for it (it uses a txt file with words separated by crlfs)
static void Main(string[] args)
{
const string fIn = @"C:\projects\droid\WriteFiles\input\allwords.txt";
const string fwordxOut = @"C:\projects\droid\WriteFiles\output\wordx.dat";
const string fWordOut = @"C:\projects\droid\WriteFiles\output\word.dat";
int i = 0;
int offset = 0;
int j = 0;
var lines = File.ReadLines(fIn);
FileStream stream = new FileStream(fwordxOut, FileMode.Create, FileAccess.ReadWrite);
using (EndianBinaryWriter wwordxOut = new EndianBinaryWriter(EndianBitConverter.Big, stream))
{
using (StreamWriter wWordOut = new StreamWriter(File.Open(fWordOut, FileMode.Create)))
{
foreach (var line in lines)
{
wWordOut.Write(line);
i = offset | ((int)line.Length << 22); //first 10 bits to the left is the word size
offset = offset + (int)line.Length;
wwordxOut.Write(i);
//if (j == 7)
// break;
j++;
}
}
}
}
And this is the Java code for the binary file search:
public static void binarySearch() {
String TAG = "TEST";
String wordFilePath = Environment.getExternalStorageDirectory().getAbsolutePath() + "/word.dat";
String wordxFilePath = Environment.getExternalStorageDirectory().getAbsolutePath() + "/wordx.dat";
String target = "abracadabra";
boolean targetFound = false;
int searchCount = 0;
try {
RandomAccessFile raf = new RandomAccessFile(wordxFilePath, "r");
RandomAccessFile rafWord = new RandomAccessFile(wordFilePath, "r");
long low = 0;
long high = (raf.length() / 4) - 1;
int cur = 0;
long wordOffset = 0;
int len = 0;
while (high >= low) {
long mid = (low + high) / 2;
raf.seek(mid * 4);
cur = raf.readInt();
Log.v(TAG + "-cur", String.valueOf(cur));
len = cur >> 22; //word length
cur = cur & 0x3FFFFF; //first 10 bits are 0
rafWord.seek(cur);
byte [] bytes = new byte[len];
wordOffset = rafWord.read(bytes, 0, len);
Log.v(TAG + "-wordOffset", String.valueOf(wordOffset));
searchCount++;
String str = new String(bytes);
Log.v(TAG, str);
if (target.compareTo(str) < 0) {
high = mid - 1;
} else if (target.compareTo(str) == 0) {
targetFound = true;
break;
} else {
low = mid + 1;
}
}
raf.close();
rafWord.close();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
if (targetFound == true) {
Log.v(TAG + "-found " , String.valueOf(searchCount));
} else {
Log.v(TAG + "-not found " , String.valueOf(searchCount));
}
}
Though it might sound like overkill, don't store data you need to do this with as a flat file. Make a database and query the data in the database. This should be both effective and fast.
Here is a function that I think works (using this in practice). Lines can have any length. You have to supply a lambda called "nav" to do the actual line check so you are flexible in the file's order (case-sensitive, case-insensitive, ordered by a certain field etc.).
import java.io.File;
import java.io.RandomAccessFile;
class main {
// returns pair(character range in file, line) or null if not found
// if no exact match found, return line above
// nav takes a line and returns -1 (move up), 0 (found) or 1 (move down)
// The line supplied to nav is stripped of the trailing \n, but not the \r
// UTF-8 encoding is assumed
static Pair<LongRange, String> binarySearchForLineInTextFile(File file, IF1<String, Integer> nav) {
long length = l(file);
int bufSize = 1024;
RandomAccessFile raf = randomAccessFileForReading(file);
try {
long min = 0, max = length;
int direction = 0;
Pair<LongRange, String> possibleResult = null;
while (min < max) {
ping();
long middle = (min + max) / 2;
long lineStart = raf_findBeginningOfLine(raf, middle, bufSize);
long lineEnd = raf_findEndOfLine(raf, middle, bufSize);
String line = fromUtf8(raf_readFilePart(raf, lineStart, (int) (lineEnd - 1 - lineStart)));
direction = nav.get(line);
possibleResult = (Pair<LongRange, String>) new Pair(new LongRange(lineStart, lineEnd), line);
if (direction == 0) return possibleResult;
// asserts are to assure that loop terminates
if (direction < 0) max = assertLessThan(max, lineStart);
else min = assertBiggerThan(min, lineEnd);
}
if (direction >= 0) return possibleResult;
long lineStart = raf_findBeginningOfLine(raf, min - 1, bufSize);
String line = fromUtf8(raf_readFilePart(raf, lineStart, (int) (min - 1 - lineStart)));
return new Pair(new LongRange(lineStart, min), line);
} finally {
_close(raf);
}
}
static int l(byte[] a) {
return a == null ? 0 : a.length;
}
static long l(File f) {
return f == null ? 0 : f.length();
}
static RandomAccessFile randomAccessFileForReading(File path) {
try {
return new RandomAccessFile(path, "r");
} catch (Exception __e) {
throw rethrow(__e);
}
}
// you can change this function to allow interrupting long calculations from the outside. just throw a RuntimeException.
static boolean ping() {
return true;
}
static long raf_findBeginningOfLine(RandomAccessFile raf, long pos, int bufSize) {
try {
byte[] buf = new byte[bufSize];
while (pos > 0) {
long start = Math.max(pos - bufSize, 0);
raf.seek(start);
raf.readFully(buf, 0, (int) Math.min(pos - start, bufSize));
int idx = lastIndexOf_byteArray(buf, (byte) '\n');
if (idx >= 0) return start + idx + 1;
pos = start;
}
return 0;
} catch (Exception __e) {
throw rethrow(__e);
}
}
static long raf_findEndOfLine(RandomAccessFile raf, long pos, int bufSize) {
try {
byte[] buf = new byte[bufSize];
long length = raf.length();
while (pos < length) {
raf.seek(pos);
raf.readFully(buf, 0, (int) Math.min(length - pos, bufSize));
int idx = indexOf_byteArray(buf, (byte) '\n');
if (idx >= 0) return pos + idx + 1;
pos += bufSize;
}
return length;
} catch (Exception __e) {
throw rethrow(__e);
}
}
static String fromUtf8(byte[] bytes) {
try {
return bytes == null ? null : new String(bytes, "UTF-8");
} catch (Exception __e) {
throw rethrow(__e);
}
}
static byte[] raf_readFilePart(RandomAccessFile raf, long start, int l) {
try {
byte[] buf = new byte[l];
raf.seek(start);
raf.readFully(buf);
return buf;
} catch (Exception __e) {
throw rethrow(__e);
}
}
static <A> A assertLessThan(A a, A b) {
assertTrue(cmp(b, a) < 0);
return b;
}
static <A> A assertBiggerThan(A a, A b) {
assertTrue(cmp(b, a) > 0);
return b;
}
static void _close(AutoCloseable c) {
try {
if (c != null)
c.close();
} catch (Throwable e) {
throw rethrow(e);
}
}
static RuntimeException rethrow(Throwable t) {
throw t instanceof RuntimeException ? (RuntimeException) t : new RuntimeException(t);
}
static int lastIndexOf_byteArray(byte[] a, byte b) {
for (int i = l(a) - 1; i >= 0; i--)
if (a[i] == b)
return i;
return -1;
}
static int indexOf_byteArray(byte[] a, byte b) {
int n = l(a);
for (int i = 0; i < n; i++)
if (a[i] == b)
return i;
return -1;
}
static boolean assertTrue(boolean b) {
if (!b)
throw fail("oops");
return b;
}
static int cmp(Object a, Object b) {
if (a == null) return b == null ? 0 : -1;
if (b == null) return 1;
return ((Comparable) a).compareTo(b);
}
static RuntimeException fail(String msg) {
throw new RuntimeException(msg == null ? "" : msg);
}
final static class LongRange {
long start, end;
LongRange(long start, long end) {
this.end = end;
this.start = start;
}
public String toString() {
return "[" + start + ";" + end + "]";
}
}
interface IF1<A, B> {
B get(A a);
}
static class Pair<A, B> {
A a;
B b;
Pair(A a, B b) {
this.b = b;
this.a = a;
}
public String toString() {
return "<" + a + ", " + b + ">";
}
}
}
© 2022 - 2024 — McMap. All rights reserved.
contains()
method ofString
class on each line. – Smashed