001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language; 019 020import java.util.Arrays; 021import java.util.Locale; 022 023import org.apache.commons.codec.EncoderException; 024import org.apache.commons.codec.StringEncoder; 025 026/** 027 * Encodes a string into a Cologne Phonetic value. 028 * <p> 029 * Implements the <a href="https://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">Kölner Phonetik</a> 030 * (<a href="https://en.wikipedia.org/wiki/Cologne_phonetics">Cologne phonetics</a>) algorithm issued by Hans Joachim Postel in 1969. 031 * </p> 032 * <p> 033 * The <em>Kölner Phonetik</em> is a phonetic algorithm which is optimized for the German language. It is related to 034 * the well-known Soundex algorithm. 035 * </p> 036 * 037 * <h2>Algorithm</h2> 038 * 039 * <ul> 040 * 041 * <li> 042 * <h3>Step 1:</h3> 043 * After preprocessing (conversion to upper case, transcription of <a 044 * href="https://en.wikipedia.org/wiki/Germanic_umlaut">germanic umlauts</a>, removal of non alphabetical characters) the 045 * letters of the supplied text are replaced by their phonetic code according to the following table. 046 * <table border="1"> 047 * <caption style="caption-side: bottom"><small><i>(Source: <a 048 * href="https://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik#Buchstabencodes">Wikipedia (de): Kölner Phonetik -- 049 * Buchstabencodes</a>)</i></small></caption> <tbody> 050 * <tr> 051 * <th>Letter</th> 052 * <th>Context</th> 053 * <th>Code</th> 054 * </tr> 055 * <tr> 056 * <td>A, E, I, J, O, U, Y</td> 057 * <td></td> 058 * <td>0</td> 059 * </tr> 060 * <tr> 061 * 062 * <td>H</td> 063 * <td></td> 064 * <td>-</td> 065 * </tr> 066 * <tr> 067 * <td>B</td> 068 * <td></td> 069 * <td rowspan="2">1</td> 070 * </tr> 071 * <tr> 072 * <td>P</td> 073 * <td>not before H</td> 074 * 075 * </tr> 076 * <tr> 077 * <td>D, T</td> 078 * <td>not before C, S, Z</td> 079 * <td>2</td> 080 * </tr> 081 * <tr> 082 * <td>F, V, W</td> 083 * <td></td> 084 * <td rowspan="2">3</td> 085 * </tr> 086 * <tr> 087 * 088 * <td>P</td> 089 * <td>before H</td> 090 * </tr> 091 * <tr> 092 * <td>G, K, Q</td> 093 * <td></td> 094 * <td rowspan="3">4</td> 095 * </tr> 096 * <tr> 097 * <td rowspan="2">C</td> 098 * <td>at onset before A, H, K, L, O, Q, R, U, X</td> 099 * 100 * </tr> 101 * <tr> 102 * <td>before A, H, K, O, Q, U, X except after S, Z</td> 103 * </tr> 104 * <tr> 105 * <td>X</td> 106 * <td>not after C, K, Q</td> 107 * <td>48</td> 108 * </tr> 109 * <tr> 110 * <td>L</td> 111 * <td></td> 112 * 113 * <td>5</td> 114 * </tr> 115 * <tr> 116 * <td>M, N</td> 117 * <td></td> 118 * <td>6</td> 119 * </tr> 120 * <tr> 121 * <td>R</td> 122 * <td></td> 123 * <td>7</td> 124 * </tr> 125 * 126 * <tr> 127 * <td>S, Z</td> 128 * <td></td> 129 * <td rowspan="6">8</td> 130 * </tr> 131 * <tr> 132 * <td rowspan="3">C</td> 133 * <td>after S, Z</td> 134 * </tr> 135 * <tr> 136 * <td>at onset except before A, H, K, L, O, Q, R, U, X</td> 137 * </tr> 138 * 139 * <tr> 140 * <td>not before A, H, K, O, Q, U, X</td> 141 * </tr> 142 * <tr> 143 * <td>D, T</td> 144 * <td>before C, S, Z</td> 145 * </tr> 146 * <tr> 147 * <td>X</td> 148 * <td>after C, K, Q</td> 149 * </tr> 150 * </tbody> 151 * </table> 152 * 153 * <h4>Example:</h4> 154 * 155 * {@code "M}ü{@code ller-L}ü{@code denscheidt"} -> {@code "MULLERLUDENSCHEIDT"} -> {@code "6005507500206880022"} 156 * 157 * </li> 158 * 159 * <li> 160 * <h3>Step 2:</h3> 161 * Collapse of all multiple consecutive code digits. 162 * <h4>Example:</h4> 163 * {@code "6005507500206880022"} -> {@code "6050750206802"}</li> 164 * 165 * <li> 166 * <h3>Step 3:</h3> 167 * Removal of all codes "0" except at the beginning. This means that two or more identical consecutive digits can occur 168 * if they occur after removing the "0" digits. 169 * 170 * <h4>Example:</h4> 171 * {@code "6050750206802"} -> {@code "65752682"}</li> 172 * 173 * </ul> 174 * 175 * <p> 176 * This class is thread-safe. 177 * </p> 178 * 179 * @see <a href="https://en.wikipedia.org/wiki/Cologne_phonetics">Wikipedia: Cologne phonetics</a> 180 * @see <a href="https://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">Wikipedia (de): Kölner Phonetik (in German)</a> 181 * @since 1.5 182 */ 183public class ColognePhonetic implements StringEncoder { 184 185 /** 186 * This class is not thread-safe; the field {@link #length} is mutable. However, it is not shared between threads, as it is constructed on demand by the 187 * method {@link ColognePhonetic#colognePhonetic(String)}. 188 */ 189 private abstract static class CologneBuffer { 190 191 protected final char[] data; 192 193 protected int length; 194 195 protected CologneBuffer(final char[] data) { 196 this.data = data; 197 this.length = data.length; 198 } 199 200 protected CologneBuffer(final int buffSize) { 201 this.data = new char[buffSize]; 202 this.length = 0; 203 } 204 205 protected abstract char[] copyData(int start, int length); 206 207 boolean isEmpty() { 208 return length() == 0; 209 } 210 211 int length() { 212 return length; 213 } 214 215 @Override 216 public String toString() { 217 return new String(copyData(0, length)); 218 } 219 } 220 221 private final class CologneInputBuffer extends CologneBuffer { 222 223 CologneInputBuffer(final char[] data) { 224 super(data); 225 } 226 227 @Override 228 protected char[] copyData(final int start, final int length) { 229 final char[] newData = new char[length]; 230 System.arraycopy(data, data.length - this.length + start, newData, 0, length); 231 return newData; 232 } 233 234 char getNextChar() { 235 return data[getNextPos()]; 236 } 237 238 protected int getNextPos() { 239 return data.length - length; 240 } 241 242 char removeNext() { 243 final char ch = getNextChar(); 244 length--; 245 return ch; 246 } 247 } 248 249 private final class CologneOutputBuffer extends CologneBuffer { 250 251 private char lastCode; 252 253 CologneOutputBuffer(final int buffSize) { 254 super(buffSize); 255 lastCode = '/'; // impossible value 256 } 257 258 @Override 259 protected char[] copyData(final int start, final int length) { 260 return Arrays.copyOfRange(data, start, length); 261 } 262 263 /** 264 * Stores the next code in the output buffer, keeping track of the previous code. '0' is only stored if it is the first entry. Ignored chars are never 265 * stored. If the code is the same as the last code (whether stored or not) it is not stored. 266 * 267 * @param code the code to store. 268 */ 269 void put(final char code) { 270 final boolean accept = code != CHAR_IGNORE; 271 final boolean nonZ = code != '0'; 272 if (accept && lastCode != code && (nonZ || length == 0)) { 273 data[length] = code; 274 length++; 275 } 276 if (nonZ && accept) { 277 lastCode = code; 278 } 279 } 280 } 281 // Predefined char arrays for better performance and less GC load 282 private static final char[] AEIJOUY = { 'A', 'E', 'I', 'J', 'O', 'U', 'Y' }; 283 private static final char[] CSZ = { 'C', 'S', 'Z' }; 284 private static final char[] FPVW = { 'F', 'P', 'V', 'W' }; 285 private static final char[] GKQ = { 'G', 'K', 'Q' }; 286 private static final char[] CKQ = { 'C', 'K', 'Q' }; 287 private static final char[] AHKLOQRUX = { 'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X' }; 288 289 private static final char[] SZ = { 'S', 'Z' }; 290 291 private static final char[] AHKOQUX = { 'A', 'H', 'K', 'O', 'Q', 'U', 'X' }; 292 293 private static final char[] DTX = { 'D', 'T', 'X' }; 294 295 private static final char CHAR_IGNORE = '-'; // is this character to be ignored? 296 297 /* 298 * Returns whether the array contains the key, or not. 299 */ 300 private static boolean arrayContains(final char[] arr, final char key) { 301 for (final char element : arr) { 302 if (element == key) { 303 return true; 304 } 305 } 306 return false; 307 } 308 309 /** 310 * Constructs a new instance. 311 */ 312 public ColognePhonetic() { 313 // empty 314 } 315 316 /** 317 * <p> 318 * Implements the <em>Kölner Phonetik</em> algorithm. 319 * </p> 320 * <p> 321 * In contrast to the initial description of the algorithm, this implementation does the encoding in one pass. 322 * </p> 323 * 324 * @param text The source text to encode. 325 * @return the corresponding encoding according to the <em>Kölner Phonetik</em> algorithm. 326 */ 327 public String colognePhonetic(final String text) { 328 if (text == null) { 329 return null; 330 } 331 final CologneInputBuffer input = new CologneInputBuffer(preprocess(text)); 332 final CologneOutputBuffer output = new CologneOutputBuffer(input.length() * 2); 333 char nextChar; 334 char lastChar = CHAR_IGNORE; 335 char chr; 336 while (!input.isEmpty()) { 337 chr = input.removeNext(); 338 if (!input.isEmpty()) { 339 nextChar = input.getNextChar(); 340 } else { 341 nextChar = CHAR_IGNORE; 342 } 343 if (chr < 'A' || chr > 'Z') { 344 continue; // ignore unwanted characters 345 } 346 if (arrayContains(AEIJOUY, chr)) { 347 output.put('0'); 348 } else if (chr == 'B' || chr == 'P' && nextChar != 'H') { 349 output.put('1'); 350 } else if ((chr == 'D' || chr == 'T') && !arrayContains(CSZ, nextChar)) { 351 output.put('2'); 352 } else if (arrayContains(FPVW, chr)) { 353 output.put('3'); 354 } else if (arrayContains(GKQ, chr)) { 355 output.put('4'); 356 } else if (chr == 'X' && !arrayContains(CKQ, lastChar)) { 357 output.put('4'); 358 output.put('8'); 359 } else if (chr == 'S' || chr == 'Z') { 360 output.put('8'); 361 } else if (chr == 'C') { 362 if (output.isEmpty()) { 363 if (arrayContains(AHKLOQRUX, nextChar)) { 364 output.put('4'); 365 } else { 366 output.put('8'); 367 } 368 } else if (arrayContains(SZ, lastChar) || !arrayContains(AHKOQUX, nextChar)) { 369 output.put('8'); 370 } else { 371 output.put('4'); 372 } 373 } else if (arrayContains(DTX, chr)) { 374 output.put('8'); 375 } else { 376 switch (chr) { 377 case 'R': 378 output.put('7'); 379 break; 380 case 'L': 381 output.put('5'); 382 break; 383 case 'M': 384 case 'N': 385 output.put('6'); 386 break; 387 case 'H': 388 output.put(CHAR_IGNORE); // needed by put 389 break; 390 default: 391 break; 392 } 393 } 394 lastChar = chr; 395 } 396 return output.toString(); 397 } 398 399 @Override 400 public Object encode(final Object object) throws EncoderException { 401 if (!(object instanceof String)) { 402 throw new EncoderException(String.format("This method's parameter was expected to be of the type %s. But actually it was of the type %s.", 403 String.class.getName(), object.getClass().getName())); 404 } 405 return encode((String) object); 406 } 407 408 @Override 409 public String encode(final String text) { 410 return colognePhonetic(text); 411 } 412 413 /** 414 * Compares the first encoded string to the second encoded string. 415 * 416 * @param text1 source text to encode before testing for equality. 417 * @param text2 source text to encode before testing for equality. 418 * @return {@code true} if the encoding the first string equals the encoding of the second string, {@code false} otherwise. 419 */ 420 public boolean isEncodeEqual(final String text1, final String text2) { 421 return colognePhonetic(text1).equals(colognePhonetic(text2)); 422 } 423 424 /** 425 * Converts the string to upper case and replaces Germanic umlaut characters The following characters are mapped: 426 * <ul> 427 * <li>capital A, umlaut mark</li> 428 * <li>capital U, umlaut mark</li> 429 * <li>capital O, umlaut mark</li> 430 * <li>small sharp s, German</li> 431 * </ul> 432 */ 433 private char[] preprocess(final String text) { 434 // This converts German small sharp s (Eszett) to SS 435 final char[] chrs = text.toUpperCase(Locale.GERMAN).toCharArray(); 436 for (int index = 0; index < chrs.length; index++) { 437 switch (chrs[index]) { 438 case '\u00C4': // capital A, umlaut mark 439 chrs[index] = 'A'; 440 break; 441 case '\u00DC': // capital U, umlaut mark 442 chrs[index] = 'U'; 443 break; 444 case '\u00D6': // capital O, umlaut mark 445 chrs[index] = 'O'; 446 break; 447 default: 448 break; 449 } 450 } 451 return chrs; 452 } 453}