001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 package org.apache.hadoop.mapreduce;
020
021 import java.io.IOException;
022
023 import org.apache.hadoop.classification.InterfaceAudience;
024 import org.apache.hadoop.classification.InterfaceStability;
025 /**
026 * <code>OutputCommitter</code> describes the commit of task output for a
027 * Map-Reduce job.
028 *
029 * <p>The Map-Reduce framework relies on the <code>OutputCommitter</code> of
030 * the job to:<p>
031 * <ol>
032 * <li>
033 * Setup the job during initialization. For example, create the temporary
034 * output directory for the job during the initialization of the job.
035 * </li>
036 * <li>
037 * Cleanup the job after the job completion. For example, remove the
038 * temporary output directory after the job completion.
039 * </li>
040 * <li>
041 * Setup the task temporary output.
042 * </li>
043 * <li>
044 * Check whether a task needs a commit. This is to avoid the commit
045 * procedure if a task does not need commit.
046 * </li>
047 * <li>
048 * Commit of the task output.
049 * </li>
050 * <li>
051 * Discard the task commit.
052 * </li>
053 * </ol>
054 * The methods in this class can be called from several different processes and
055 * from several different contexts. It is important to know which process and
056 * which context each is called from. Each method should be marked accordingly
057 * in its documentation.
058 *
059 * @see org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
060 * @see JobContext
061 * @see TaskAttemptContext
062 */
063 @InterfaceAudience.Public
064 @InterfaceStability.Stable
065 public abstract class OutputCommitter {
066 /**
067 * For the framework to setup the job output during initialization. This is
068 * called from the application master process for the entire job.
069 *
070 * @param jobContext Context of the job whose output is being written.
071 * @throws IOException if temporary output could not be created
072 */
073 public abstract void setupJob(JobContext jobContext) throws IOException;
074
075 /**
076 * For cleaning up the job's output after job completion. This is called
077 * from the application master process for the entire job.
078 *
079 * @param jobContext Context of the job whose output is being written.
080 * @throws IOException
081 * @deprecated Use {@link #commitJob(JobContext)} and
082 * {@link #abortJob(JobContext, JobStatus.State)} instead.
083 */
084 @Deprecated
085 public void cleanupJob(JobContext jobContext) throws IOException { }
086
087 /**
088 * For committing job's output after successful job completion. Note that this
089 * is invoked for jobs with final runstate as SUCCESSFUL. This is called
090 * from the application master process for the entire job.
091 *
092 * @param jobContext Context of the job whose output is being written.
093 * @throws IOException
094 */
095 public void commitJob(JobContext jobContext) throws IOException {
096 cleanupJob(jobContext);
097 }
098
099
100 /**
101 * For aborting an unsuccessful job's output. Note that this is invoked for
102 * jobs with final runstate as {@link JobStatus.State#FAILED} or
103 * {@link JobStatus.State#KILLED}. This is called from the application
104 * master process for the entire job.
105 *
106 * @param jobContext Context of the job whose output is being written.
107 * @param state final runstate of the job
108 * @throws IOException
109 */
110 public void abortJob(JobContext jobContext, JobStatus.State state)
111 throws IOException {
112 cleanupJob(jobContext);
113 }
114
115 /**
116 * Sets up output for the task. This is called from each individual task's
117 * process that will output to HDFS, and it is called just for that task.
118 *
119 * @param taskContext Context of the task whose output is being written.
120 * @throws IOException
121 */
122 public abstract void setupTask(TaskAttemptContext taskContext)
123 throws IOException;
124
125 /**
126 * Check whether task needs a commit. This is called from each individual
127 * task's process that will output to HDFS, and it is called just for that
128 * task.
129 *
130 * @param taskContext
131 * @return true/false
132 * @throws IOException
133 */
134 public abstract boolean needsTaskCommit(TaskAttemptContext taskContext)
135 throws IOException;
136
137 /**
138 * To promote the task's temporary output to final output location.
139 * If {@link #needsTaskCommit(TaskAttemptContext)} returns true and this
140 * task is the task that the AM determines finished first, this method
141 * is called to commit an individual task's output. This is to mark
142 * that tasks output as complete, as {@link #commitJob(JobContext)} will
143 * also be called later on if the entire job finished successfully. This
144 * is called from a task's process.
145 *
146 * @param taskContext Context of the task whose output is being written.
147 * @throws IOException if commit is not successful.
148 */
149 public abstract void commitTask(TaskAttemptContext taskContext)
150 throws IOException;
151
152 /**
153 * Discard the task output. This is called from a task's process to clean
154 * up a single task's output that can not yet been committed.
155 *
156 * @param taskContext
157 * @throws IOException
158 */
159 public abstract void abortTask(TaskAttemptContext taskContext)
160 throws IOException;
161
162 /**
163 * Is task output recovery supported for restarting jobs?
164 *
165 * If task output recovery is supported, job restart can be done more
166 * efficiently.
167 *
168 * @return <code>true</code> if task output recovery is supported,
169 * <code>false</code> otherwise
170 * @see #recoverTask(TaskAttemptContext)
171 */
172 public boolean isRecoverySupported() {
173 return false;
174 }
175
176 /**
177 * Recover the task output.
178 *
179 * The retry-count for the job will be passed via the
180 * {@link MRJobConfig#APPLICATION_ATTEMPT_ID} key in
181 * {@link TaskAttemptContext#getConfiguration()} for the
182 * <code>OutputCommitter</code>. This is called from the application master
183 * process, but it is called individually for each task.
184 *
185 * If an exception is thrown the task will be attempted again.
186 *
187 * @param taskContext Context of the task whose output is being recovered
188 * @throws IOException
189 */
190 public void recoverTask(TaskAttemptContext taskContext)
191 throws IOException
192 {}
193 }