I also ran into this problem, and here's the solution I came up with for now. It's not perfect, but hopefully it'll be helpful. For reference, I'm using Java 1.7 and AWS Java SDK version 1.9.13.
Note that this code assumes that you're waiting for the cluster to terminate, not the steps strictly speaking; if your cluster terminates when all your steps are done this is alright, but if you're using clusters that stay alive after step completion this won't help you too much.
Also, note that this code monitors and logs cluster state changes, and in addition diagnoses whether the cluster terminated with errors and throws an exception if it did.
private void yourMainMethod() {
RunJobFlowRequest request = ...;
try {
RunJobFlowResult submission = emr.runJobFlow(request);
String jobFlowId = submission.getJobFlowId();
log.info("Submitted EMR job as job flow id {}", jobFlowId);
DescribeClusterResult result =
waitForCompletion(emr, jobFlowId, 90, TimeUnit.SECONDS);
diagnoseClusterResult(result, jobFlowId);
} finally {
emr.shutdown();
}
}
private DescribeClusterResult waitForCompletion(
AmazonElasticMapReduceClient emr, String jobFlowId,
long sleepTime, TimeUnit timeUnit)
throws InterruptedException {
String state = "STARTING";
while (true) {
DescribeClusterResult result = emr.describeCluster(
new DescribeClusterRequest().withClusterId(jobFlowId)
);
ClusterStatus status = result.getCluster().getStatus();
String newState = status.getState();
if (!state.equals(newState)) {
log.info("Cluster id {} switched from {} to {}. Reason: {}.",
jobFlowId, state, newState, status.getStateChangeReason());
state = newState;
}
switch (state) {
case "TERMINATED":
case "TERMINATED_WITH_ERRORS":
case "WAITING":
return result;
}
timeUnit.sleep(sleepTime);
}
}
private void diagnoseClusterResult(DescribeClusterResult result, String jobFlowId) {
ClusterStatus status = result.getCluster().getStatus();
ClusterStateChangeReason reason = status.getStateChangeReason();
ClusterStateChangeReasonCode code =
ClusterStateChangeReasonCode.fromValue(reason.getCode());
switch (code) {
case ALL_STEPS_COMPLETED:
log.info("Completed EMR job {}", jobFlowId);
break;
default:
failEMR(jobFlowId, status);
}
}
private static void failEMR(String jobFlowId, ClusterStatus status) {
String msg = "EMR cluster run %s terminated with errors. ClusterStatus = %s";
throw new RuntimeException(String.format(msg, jobFlowId, status));
}
jobAttributes
as given.DescribeJobFlowsRequest jobAttributes = new DescribeJobFlowRequest().withJobFlowStates( "COMPLETED", "TERMINATED", "FAILED" );
– Tody