001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 package org.apache.hadoop.io;
020
021 import java.io.IOException;
022 import java.io.DataInput;
023 import java.io.DataOutput;
024 import java.nio.ByteBuffer;
025 import java.nio.CharBuffer;
026 import java.nio.charset.CharacterCodingException;
027 import java.nio.charset.Charset;
028 import java.nio.charset.CharsetDecoder;
029 import java.nio.charset.CharsetEncoder;
030 import java.nio.charset.CodingErrorAction;
031 import java.nio.charset.MalformedInputException;
032 import java.text.CharacterIterator;
033 import java.text.StringCharacterIterator;
034 import java.util.Arrays;
035
036 import org.apache.avro.reflect.Stringable;
037
038 import org.apache.hadoop.classification.InterfaceAudience;
039 import org.apache.hadoop.classification.InterfaceStability;
040
041 /** This class stores text using standard UTF8 encoding. It provides methods
042 * to serialize, deserialize, and compare texts at byte level. The type of
043 * length is integer and is serialized using zero-compressed format. <p>In
044 * addition, it provides methods for string traversal without converting the
045 * byte array to a string. <p>Also includes utilities for
046 * serializing/deserialing a string, coding/decoding a string, checking if a
047 * byte array contains valid UTF8 code, calculating the length of an encoded
048 * string.
049 */
050 @Stringable
051 @InterfaceAudience.Public
052 @InterfaceStability.Stable
053 public class Text extends BinaryComparable
054 implements WritableComparable<BinaryComparable> {
055
056 private static ThreadLocal<CharsetEncoder> ENCODER_FACTORY =
057 new ThreadLocal<CharsetEncoder>() {
058 protected CharsetEncoder initialValue() {
059 return Charset.forName("UTF-8").newEncoder().
060 onMalformedInput(CodingErrorAction.REPORT).
061 onUnmappableCharacter(CodingErrorAction.REPORT);
062 }
063 };
064
065 private static ThreadLocal<CharsetDecoder> DECODER_FACTORY =
066 new ThreadLocal<CharsetDecoder>() {
067 protected CharsetDecoder initialValue() {
068 return Charset.forName("UTF-8").newDecoder().
069 onMalformedInput(CodingErrorAction.REPORT).
070 onUnmappableCharacter(CodingErrorAction.REPORT);
071 }
072 };
073
074 private static final byte [] EMPTY_BYTES = new byte[0];
075
076 private byte[] bytes;
077 private int length;
078
079 public Text() {
080 bytes = EMPTY_BYTES;
081 }
082
083 /** Construct from a string.
084 */
085 public Text(String string) {
086 set(string);
087 }
088
089 /** Construct from another text. */
090 public Text(Text utf8) {
091 set(utf8);
092 }
093
094 /** Construct from a byte array.
095 */
096 public Text(byte[] utf8) {
097 set(utf8);
098 }
099
100 /**
101 * Get a copy of the bytes that is exactly the length of the data.
102 * See {@link #getBytes()} for faster access to the underlying array.
103 */
104 public byte[] copyBytes() {
105 byte[] result = new byte[length];
106 System.arraycopy(bytes, 0, result, 0, length);
107 return result;
108 }
109
110 /**
111 * Returns the raw bytes; however, only data up to {@link #getLength()} is
112 * valid. Please use {@link #copyBytes()} if you
113 * need the returned array to be precisely the length of the data.
114 */
115 public byte[] getBytes() {
116 return bytes;
117 }
118
119 /** Returns the number of bytes in the byte array */
120 public int getLength() {
121 return length;
122 }
123
124 /**
125 * Returns the Unicode Scalar Value (32-bit integer value)
126 * for the character at <code>position</code>. Note that this
127 * method avoids using the converter or doing String instatiation
128 * @return the Unicode scalar value at position or -1
129 * if the position is invalid or points to a
130 * trailing byte
131 */
132 public int charAt(int position) {
133 if (position > this.length) return -1; // too long
134 if (position < 0) return -1; // duh.
135
136 ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position);
137 return bytesToCodePoint(bb.slice());
138 }
139
140 public int find(String what) {
141 return find(what, 0);
142 }
143
144 /**
145 * Finds any occurence of <code>what</code> in the backing
146 * buffer, starting as position <code>start</code>. The starting
147 * position is measured in bytes and the return value is in
148 * terms of byte position in the buffer. The backing buffer is
149 * not converted to a string for this operation.
150 * @return byte position of the first occurence of the search
151 * string in the UTF-8 buffer or -1 if not found
152 */
153 public int find(String what, int start) {
154 try {
155 ByteBuffer src = ByteBuffer.wrap(this.bytes,0,this.length);
156 ByteBuffer tgt = encode(what);
157 byte b = tgt.get();
158 src.position(start);
159
160 while (src.hasRemaining()) {
161 if (b == src.get()) { // matching first byte
162 src.mark(); // save position in loop
163 tgt.mark(); // save position in target
164 boolean found = true;
165 int pos = src.position()-1;
166 while (tgt.hasRemaining()) {
167 if (!src.hasRemaining()) { // src expired first
168 tgt.reset();
169 src.reset();
170 found = false;
171 break;
172 }
173 if (!(tgt.get() == src.get())) {
174 tgt.reset();
175 src.reset();
176 found = false;
177 break; // no match
178 }
179 }
180 if (found) return pos;
181 }
182 }
183 return -1; // not found
184 } catch (CharacterCodingException e) {
185 // can't get here
186 e.printStackTrace();
187 return -1;
188 }
189 }
190 /** Set to contain the contents of a string.
191 */
192 public void set(String string) {
193 try {
194 ByteBuffer bb = encode(string, true);
195 bytes = bb.array();
196 length = bb.limit();
197 }catch(CharacterCodingException e) {
198 throw new RuntimeException("Should not have happened ", e);
199 }
200 }
201
202 /** Set to a utf8 byte array
203 */
204 public void set(byte[] utf8) {
205 set(utf8, 0, utf8.length);
206 }
207
208 /** copy a text. */
209 public void set(Text other) {
210 set(other.getBytes(), 0, other.getLength());
211 }
212
213 /**
214 * Set the Text to range of bytes
215 * @param utf8 the data to copy from
216 * @param start the first position of the new string
217 * @param len the number of bytes of the new string
218 */
219 public void set(byte[] utf8, int start, int len) {
220 setCapacity(len, false);
221 System.arraycopy(utf8, start, bytes, 0, len);
222 this.length = len;
223 }
224
225 /**
226 * Append a range of bytes to the end of the given text
227 * @param utf8 the data to copy from
228 * @param start the first position to append from utf8
229 * @param len the number of bytes to append
230 */
231 public void append(byte[] utf8, int start, int len) {
232 setCapacity(length + len, true);
233 System.arraycopy(utf8, start, bytes, length, len);
234 length += len;
235 }
236
237 /**
238 * Clear the string to empty.
239 */
240 public void clear() {
241 length = 0;
242 }
243
244 /*
245 * Sets the capacity of this Text object to <em>at least</em>
246 * <code>len</code> bytes. If the current buffer is longer,
247 * then the capacity and existing content of the buffer are
248 * unchanged. If <code>len</code> is larger
249 * than the current capacity, the Text object's capacity is
250 * increased to match.
251 * @param len the number of bytes we need
252 * @param keepData should the old data be kept
253 */
254 private void setCapacity(int len, boolean keepData) {
255 if (bytes == null || bytes.length < len) {
256 if (bytes != null && keepData) {
257 bytes = Arrays.copyOf(bytes, Math.max(len,length << 1));
258 } else {
259 bytes = new byte[len];
260 }
261 }
262 }
263
264 /**
265 * Convert text back to string
266 * @see java.lang.Object#toString()
267 */
268 @Override
269 public String toString() {
270 try {
271 return decode(bytes, 0, length);
272 } catch (CharacterCodingException e) {
273 throw new RuntimeException("Should not have happened " , e);
274 }
275 }
276
277 /** deserialize
278 */
279 public void readFields(DataInput in) throws IOException {
280 int newLength = WritableUtils.readVInt(in);
281 setCapacity(newLength, false);
282 in.readFully(bytes, 0, newLength);
283 length = newLength;
284 }
285
286 /** Skips over one Text in the input. */
287 public static void skip(DataInput in) throws IOException {
288 int length = WritableUtils.readVInt(in);
289 WritableUtils.skipFully(in, length);
290 }
291
292 /** serialize
293 * write this object to out
294 * length uses zero-compressed encoding
295 * @see Writable#write(DataOutput)
296 */
297 public void write(DataOutput out) throws IOException {
298 WritableUtils.writeVInt(out, length);
299 out.write(bytes, 0, length);
300 }
301
302 /** Returns true iff <code>o</code> is a Text with the same contents. */
303 public boolean equals(Object o) {
304 if (o instanceof Text)
305 return super.equals(o);
306 return false;
307 }
308
309 @Override
310 public int hashCode() {
311 return super.hashCode();
312 }
313
314 /** A WritableComparator optimized for Text keys. */
315 public static class Comparator extends WritableComparator {
316 public Comparator() {
317 super(Text.class);
318 }
319
320 public int compare(byte[] b1, int s1, int l1,
321 byte[] b2, int s2, int l2) {
322 int n1 = WritableUtils.decodeVIntSize(b1[s1]);
323 int n2 = WritableUtils.decodeVIntSize(b2[s2]);
324 return compareBytes(b1, s1+n1, l1-n1, b2, s2+n2, l2-n2);
325 }
326 }
327
328 static {
329 // register this comparator
330 WritableComparator.define(Text.class, new Comparator());
331 }
332
333 /// STATIC UTILITIES FROM HERE DOWN
334 /**
335 * Converts the provided byte array to a String using the
336 * UTF-8 encoding. If the input is malformed,
337 * replace by a default value.
338 */
339 public static String decode(byte[] utf8) throws CharacterCodingException {
340 return decode(ByteBuffer.wrap(utf8), true);
341 }
342
343 public static String decode(byte[] utf8, int start, int length)
344 throws CharacterCodingException {
345 return decode(ByteBuffer.wrap(utf8, start, length), true);
346 }
347
348 /**
349 * Converts the provided byte array to a String using the
350 * UTF-8 encoding. If <code>replace</code> is true, then
351 * malformed input is replaced with the
352 * substitution character, which is U+FFFD. Otherwise the
353 * method throws a MalformedInputException.
354 */
355 public static String decode(byte[] utf8, int start, int length, boolean replace)
356 throws CharacterCodingException {
357 return decode(ByteBuffer.wrap(utf8, start, length), replace);
358 }
359
360 private static String decode(ByteBuffer utf8, boolean replace)
361 throws CharacterCodingException {
362 CharsetDecoder decoder = DECODER_FACTORY.get();
363 if (replace) {
364 decoder.onMalformedInput(
365 java.nio.charset.CodingErrorAction.REPLACE);
366 decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
367 }
368 String str = decoder.decode(utf8).toString();
369 // set decoder back to its default value: REPORT
370 if (replace) {
371 decoder.onMalformedInput(CodingErrorAction.REPORT);
372 decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
373 }
374 return str;
375 }
376
377 /**
378 * Converts the provided String to bytes using the
379 * UTF-8 encoding. If the input is malformed,
380 * invalid chars are replaced by a default value.
381 * @return ByteBuffer: bytes stores at ByteBuffer.array()
382 * and length is ByteBuffer.limit()
383 */
384
385 public static ByteBuffer encode(String string)
386 throws CharacterCodingException {
387 return encode(string, true);
388 }
389
390 /**
391 * Converts the provided String to bytes using the
392 * UTF-8 encoding. If <code>replace</code> is true, then
393 * malformed input is replaced with the
394 * substitution character, which is U+FFFD. Otherwise the
395 * method throws a MalformedInputException.
396 * @return ByteBuffer: bytes stores at ByteBuffer.array()
397 * and length is ByteBuffer.limit()
398 */
399 public static ByteBuffer encode(String string, boolean replace)
400 throws CharacterCodingException {
401 CharsetEncoder encoder = ENCODER_FACTORY.get();
402 if (replace) {
403 encoder.onMalformedInput(CodingErrorAction.REPLACE);
404 encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
405 }
406 ByteBuffer bytes =
407 encoder.encode(CharBuffer.wrap(string.toCharArray()));
408 if (replace) {
409 encoder.onMalformedInput(CodingErrorAction.REPORT);
410 encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
411 }
412 return bytes;
413 }
414
415 /** Read a UTF8 encoded string from in
416 */
417 public static String readString(DataInput in) throws IOException {
418 int length = WritableUtils.readVInt(in);
419 byte [] bytes = new byte[length];
420 in.readFully(bytes, 0, length);
421 return decode(bytes);
422 }
423
424 /** Write a UTF8 encoded string to out
425 */
426 public static int writeString(DataOutput out, String s) throws IOException {
427 ByteBuffer bytes = encode(s);
428 int length = bytes.limit();
429 WritableUtils.writeVInt(out, length);
430 out.write(bytes.array(), 0, length);
431 return length;
432 }
433
434 ////// states for validateUTF8
435
436 private static final int LEAD_BYTE = 0;
437
438 private static final int TRAIL_BYTE_1 = 1;
439
440 private static final int TRAIL_BYTE = 2;
441
442 /**
443 * Check if a byte array contains valid utf-8
444 * @param utf8 byte array
445 * @throws MalformedInputException if the byte array contains invalid utf-8
446 */
447 public static void validateUTF8(byte[] utf8) throws MalformedInputException {
448 validateUTF8(utf8, 0, utf8.length);
449 }
450
451 /**
452 * Check to see if a byte array is valid utf-8
453 * @param utf8 the array of bytes
454 * @param start the offset of the first byte in the array
455 * @param len the length of the byte sequence
456 * @throws MalformedInputException if the byte array contains invalid bytes
457 */
458 public static void validateUTF8(byte[] utf8, int start, int len)
459 throws MalformedInputException {
460 int count = start;
461 int leadByte = 0;
462 int length = 0;
463 int state = LEAD_BYTE;
464 while (count < start+len) {
465 int aByte = ((int) utf8[count] & 0xFF);
466
467 switch (state) {
468 case LEAD_BYTE:
469 leadByte = aByte;
470 length = bytesFromUTF8[aByte];
471
472 switch (length) {
473 case 0: // check for ASCII
474 if (leadByte > 0x7F)
475 throw new MalformedInputException(count);
476 break;
477 case 1:
478 if (leadByte < 0xC2 || leadByte > 0xDF)
479 throw new MalformedInputException(count);
480 state = TRAIL_BYTE_1;
481 break;
482 case 2:
483 if (leadByte < 0xE0 || leadByte > 0xEF)
484 throw new MalformedInputException(count);
485 state = TRAIL_BYTE_1;
486 break;
487 case 3:
488 if (leadByte < 0xF0 || leadByte > 0xF4)
489 throw new MalformedInputException(count);
490 state = TRAIL_BYTE_1;
491 break;
492 default:
493 // too long! Longest valid UTF-8 is 4 bytes (lead + three)
494 // or if < 0 we got a trail byte in the lead byte position
495 throw new MalformedInputException(count);
496 } // switch (length)
497 break;
498
499 case TRAIL_BYTE_1:
500 if (leadByte == 0xF0 && aByte < 0x90)
501 throw new MalformedInputException(count);
502 if (leadByte == 0xF4 && aByte > 0x8F)
503 throw new MalformedInputException(count);
504 if (leadByte == 0xE0 && aByte < 0xA0)
505 throw new MalformedInputException(count);
506 if (leadByte == 0xED && aByte > 0x9F)
507 throw new MalformedInputException(count);
508 // falls through to regular trail-byte test!!
509 case TRAIL_BYTE:
510 if (aByte < 0x80 || aByte > 0xBF)
511 throw new MalformedInputException(count);
512 if (--length == 0) {
513 state = LEAD_BYTE;
514 } else {
515 state = TRAIL_BYTE;
516 }
517 break;
518 } // switch (state)
519 count++;
520 }
521 }
522
523 /**
524 * Magic numbers for UTF-8. These are the number of bytes
525 * that <em>follow</em> a given lead byte. Trailing bytes
526 * have the value -1. The values 4 and 5 are presented in
527 * this table, even though valid UTF-8 cannot include the
528 * five and six byte sequences.
529 */
530 static final int[] bytesFromUTF8 =
531 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
532 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
533 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
534 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
535 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
536 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
537 0, 0, 0, 0, 0, 0, 0,
538 // trail bytes
539 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
540 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
541 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
542 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1,
543 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
544 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
545 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
546
547 /**
548 * Returns the next code point at the current position in
549 * the buffer. The buffer's position will be incremented.
550 * Any mark set on this buffer will be changed by this method!
551 */
552 public static int bytesToCodePoint(ByteBuffer bytes) {
553 bytes.mark();
554 byte b = bytes.get();
555 bytes.reset();
556 int extraBytesToRead = bytesFromUTF8[(b & 0xFF)];
557 if (extraBytesToRead < 0) return -1; // trailing byte!
558 int ch = 0;
559
560 switch (extraBytesToRead) {
561 case 5: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
562 case 4: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
563 case 3: ch += (bytes.get() & 0xFF); ch <<= 6;
564 case 2: ch += (bytes.get() & 0xFF); ch <<= 6;
565 case 1: ch += (bytes.get() & 0xFF); ch <<= 6;
566 case 0: ch += (bytes.get() & 0xFF);
567 }
568 ch -= offsetsFromUTF8[extraBytesToRead];
569
570 return ch;
571 }
572
573
574 static final int offsetsFromUTF8[] =
575 { 0x00000000, 0x00003080,
576 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
577
578 /**
579 * For the given string, returns the number of UTF-8 bytes
580 * required to encode the string.
581 * @param string text to encode
582 * @return number of UTF-8 bytes required to encode
583 */
584 public static int utf8Length(String string) {
585 CharacterIterator iter = new StringCharacterIterator(string);
586 char ch = iter.first();
587 int size = 0;
588 while (ch != CharacterIterator.DONE) {
589 if ((ch >= 0xD800) && (ch < 0xDC00)) {
590 // surrogate pair?
591 char trail = iter.next();
592 if ((trail > 0xDBFF) && (trail < 0xE000)) {
593 // valid pair
594 size += 4;
595 } else {
596 // invalid pair
597 size += 3;
598 iter.previous(); // rewind one
599 }
600 } else if (ch < 0x80) {
601 size++;
602 } else if (ch < 0x800) {
603 size += 2;
604 } else {
605 // ch < 0x10000, that is, the largest char value
606 size += 3;
607 }
608 ch = iter.next();
609 }
610 return size;
611 }
612 }