I wrote code to read a file that is either plain text or bz2-compressed. I use the magic bytes of the bz2 format to detect whether the file is compressed.
NOTE: the user may or may not provide a file with the proper extension.
My code:
#include <iostream>
#include <sstream>
#include <vector>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/copy.hpp>
#include <boost/iostreams/filter/bzip2.hpp>
// compile using
// g++ -std=c++11 code.cpp -lboost_iostreams
// run using
// ./a.out < compressed_file
// ./a.out < simple_file
// cat file_name | ./a.out
// Type tags produced by get_ext() and consumed by uncompress():
//   BZ2    - the input starts with the bzip2 signature,
//   NO_EXT - no known signature was recognized.
// They are fixed labels that are only ever compared or copied, so they are
// declared const to rule out accidental modification.
const std::string BZ2 = ".bzip2";
const std::string NO_EXT = "";
// Replaces the contents of `line` with its bzip2-decompressed form.
//
// line     - in/out buffer: compressed bytes on entry; on return it holds the
//            decompressed bytes when file_ext == BZ2 and is left untouched
//            for any other tag.
// file_ext - type tag; only BZ2 triggers decompression.
//
// The size of the incoming buffer is always reported on stdout first.
// NOTE(review): `line` presumably has to contain a complete bzip2 stream for
// the decompressor to succeed; a fragment cut at an arbitrary '\n' will most
// likely make boost::iostreams::copy fail - confirm against the caller.
void uncompress(std::vector<char> & line, const std::string & file_ext){
    std::cout << "size of line is " << line.size() <<std::endl;
    if (file_ext != BZ2) {
        return;
    }

    const std::string compressed(line.begin(), line.end());
    std::stringstream input(compressed);
    std::stringstream decompressed;

    boost::iostreams::filtering_istream in;
    in.push(boost::iostreams::bzip2_decompressor());
    in.push(input);
    boost::iostreams::copy(in, decompressed);

    // Take the whole decompressed buffer. Extracting with operator>> would
    // skip leading whitespace and stop at the first whitespace character,
    // silently truncating the payload.
    const std::string plain = decompressed.str();
    line.assign(plain.begin(), plain.end());
}
// Reads characters from `stream` into `container` up to and including the
// next '\n', or until the stream can deliver no more characters.
//
// `container` is emptied first, so an empty result means nothing could be
// read. Returns a reference to `container` to allow chaining.
std::vector<char>& readline(std::istream & stream, std::vector<char> & container) {
    container.clear();
    for (char ch; stream && stream.get(ch); ) {
        container.push_back(ch);
        if (ch == '\n') {
            break;
        }
    }
    return container;
}
std::string get_ext(const std::vector<char> &line) { // working fine
std::vector<std::pair<std::vector<char>, std::string>> types = { { {66, 90, 104}, BZ2} };// magic char of bzip file
for (auto & type : types) if (std::equal(type.first.begin(), type.first.end(), line.begin())) return type.second;
return NO_EXT;
}
// Writes the characters held in `line` to stdout, followed by a newline and
// a flush of the stream.
void print_line(std::vector<char> &line) {
    const std::string text(line.begin(), line.end());
    std::cout << text;
    std::cout << std::endl;
}
// Reads stdin chunk by chunk (each chunk ends at '\n' or end of input),
// determines the type tag from the first chunk, then passes every chunk
// through uncompress() and prints it.
int main () {
    std::vector<char> line;

    // The first chunk carries the magic bytes that decide the type tag.
    const std::string file_ext = get_ext(readline(std::cin, line));

    // The first chunk is always processed, even when it is empty; after that
    // an empty chunk signals that the input is exhausted.
    do {
        uncompress(line, file_ext);
        print_line(line);
    } while (!readline(std::cin, line).empty());
}
There is a problem with this code: when reading a compressed file, it reads the whole compressed file. I don't want to load the whole file into memory just to test the file type.
The file size may be greater than 4 GB.
If I could somehow figure out the file type beforehand, it would be pretty easy to do this:
// Type tags describing the input: BZ2 for bzip2-compressed data, NO_EXT when
// no known signature was recognized. They are only ever compared or copied,
// so they are declared const to rule out accidental modification.
const std::string BZ2 = ".bzip2";
const std::string NO_EXT = "";
// Assembles the filter chain `in` on top of `input`.
//
// input    - underlying stream, always pushed last as the end of the chain.
// file_ext - type tag; when it equals BZ2 a bzip2 decompressor is pushed in
//            front of `input`, otherwise `input` is pushed on its own.
// in       - filter chain to populate; the caller reads from it afterwards.
void uncompress(std::istream & input,
                const std::string & file_ext,
                boost::iostreams::filtering_istream & in)
{
    const bool is_bzip2 = (file_ext == BZ2);
    if (is_bzip2) {
        in.push(boost::iostreams::bzip2_decompressor());
    }
    in.push(input);
}
// Reads characters from `stream` into `container` up to and including the
// next '\n', or until the stream can deliver no more characters.
//
// stream    - any input stream. boost::iostreams::filtering_istream derives
//             from std::istream, so callers passing one keep working, while
//             plain streams such as std::cin are now accepted as well.
// container - emptied first, then filled; an empty result means nothing
//             could be read.
//
// Returns a reference to `container` to allow chaining.
std::vector<char>& readline(std::istream & stream, std::vector<char> & container) {
    container.clear();
    char c;
    while (stream && stream.get(c)) {
        container.push_back(c);
        if (c == '\n') {
            break;
        }
    }
    return container;
}
std::string get_ext(const std::vector<char> &line) { // working fine
std::vector<std::pair<std::vector<char>, std::string>> types = { { { 66, 90, 104 }, BZ2 } };
for (auto & type : types) if (std::equal(type.first.begin(), type.first.end(), line.begin())) return type.second;
return NO_EXT;
}
// Prints the characters of `line` on stdout, then ends the line and flushes.
void print_line(std::vector<char> &line) {
    std::cout << std::string(line.begin(), line.end()) << std::endl;
}
// Streams stdin through the filter chain built by uncompress() and prints it
// chunk by chunk (each chunk ends at '\n' or end of input).
int main () {
    std::vector<char> line;
    boost::iostreams::filtering_istream in;

    // The type tag is hard-coded here; detecting it is the open question.
    const std::string file_ext = BZ2;
    uncompress(std::cin, file_ext, in);

    // An empty chunk means the chain has no more data to deliver.
    for (readline(in, line); !line.empty(); readline(in, line)) {
        print_line(line);
    }
}
I have no idea how to know that beforehand, or what other approach I could take.
magic
and compressed_magic
(just to make sure, a plain text could perhaps begin with "BZ"), seek back to the beginning (or just reopen the file) and then pass it to the appropriate handler. That is, unless you're suggesting that the compressed file is actually a number of bz2-compressed chunks separated by newlines. – Accustom
reopen the file
as I am not always providing the input as a file; it may be piped in, like cat filename | ./a.out
– Yeung