My use case was to copy Parquet files from an S3 location in AWS account A to an S3 location in AWS account B, without using Spark. I used the snippet below to do this.
The key takeaway for me was to use a byte array as an intermediate buffer between the input stream and the output stream.
Imports:
import com.amazonaws.auth.AWSCredentials;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.*;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import lombok.extern.slf4j.Slf4j;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
Code snippet:
String inputBucket = "XXXX";
String outputBucket = "XXXX";
String s3Prefix = "XXXX";
String awsAccessKey = "XXXXX";
String awsSecretKey = "XXXXX";
AWSCredentials credentials = new BasicAWSCredentials(awsAccessKey, awsSecretKey);
AmazonS3 s3client = new AmazonS3Client(credentials);
FileSystem fileSystem = FileSystem.get(URI.create("s3a:" + outputBucket), new JobConf(MyClass.class));
FSDataOutputStream fsDataOutputStream = null;
int count = 0;
try {
ObjectListing objectListing = s3client.listObjects(new ListObjectsRequest().withBucketName(inputBucket)
.withPrefix(s3Prefix));
for (S3ObjectSummary objectSummary : objectListing.getObjectSummaries()) {
String fileName = objectSummary.getKey().substring(objectSummary.getKey().lastIndexOf(DELIMITER) + 1);
String outputPath = "s3a:" + outputBucket + s3Prefix + "/" + fileName;
fsDataOutputStream = fileSystem.create(new Path(outputPath));
InputStream inputStream = s3client.getObject(new GetObjectRequest(inputBucket, objectSummary.getKey())).getObjectContent();
byte[] buffer = new byte[4096];
int bytesRead = inputStream.read(buffer);
while (bytesRead != -1) {
fsDataOutputStream.write(buffer, 0, bytesRead);
bytesRead = inputStream.read(buffer);
}
inputStream.close();
fsDataOutputStream.close();
count += 1;
log.info("Successfully downloaded file to temporary path: {}", outputPath);
}
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if (fsDataOutputStream != null) {
fsDataOutputStream.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}