Coverage Report - org.simpleframework.xml.stream.Splitter
 
Classes in this File Line Coverage Branch Coverage Complexity
Splitter
100%
65/65
95%
38/40
2.308
 
 1  
 /*
 2  
  * Splitter.java July 2008
 3  
  *
 4  
  * Copyright (C) 2008, Niall Gallagher <niallg@users.sf.net>
 5  
  *
 6  
  * Licensed under the Apache License, Version 2.0 (the "License");
 7  
  * you may not use this file except in compliance with the License.
 8  
  * You may obtain a copy of the License at
 9  
  *
 10  
  *     http://www.apache.org/licenses/LICENSE-2.0
 11  
  *
 12  
  * Unless required by applicable law or agreed to in writing, software
 13  
  * distributed under the License is distributed on an "AS IS" BASIS,
 14  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
 15  
  * implied. See the License for the specific language governing 
 16  
  * permissions and limitations under the License.
 17  
  */
 18  
 
 19  
 package org.simpleframework.xml.stream;
 20  
 
 21  
 /**
 22  
  * The <code>Splitter</code> object is used to split up a string in to
 23  
  * tokens that can be used to create a camel case or hyphenated text
 24  
  * representation of the string. This will preserve acronyms and
 25  
  * numbers and splits tokens by case and character type. Examples
 26  
  * of how a string would be split are as follows.
 27  
  * <pre>
 28  
  * 
 29  
  *    CamelCaseString = "Camel" "Case" "String"
 30  
  *    hyphenated-text = "hyphenated" "text"
 31  
  *    URLAcronym      = "URL" "acronym"
 32  
  *    RFC2616.txt     = "RFC" "2616" "txt"
 33  
  * 
 34  
  * </pre>
 35  
  * By splitting strings in to individual words this allows the
 36  
  * splitter to be used to assemble the words in a way that adheres
 37  
  * to a specific style. Each style can then be applied to an XML 
 38  
  * document to give it a consistent format.
 39  
  * 
 40  
  * @author Niall Gallagher
 41  
  * 
 42  
  * @see org.simpleframework.xml.stream.Style
 43  
  */
 44  
 abstract class Splitter {
 45  
 
 46  
    /**
 47  
     * This is the string builder used to build the processed text.
 48  
     */
 49  
    protected StringBuilder builder;
 50  
    
 51  
    /**
 52  
     * This is the original text that is to be split in to words.
 53  
     */
 54  
    protected char[] text;
 55  
    
 56  
    /**
 57  
     * This is the number of characters to be considered for use.
 58  
     */
 59  
    protected int count;
 60  
    
 61  
    /**
 62  
     * This is the current read offset of the text string.
 63  
     */
 64  
    protected int off;
 65  
    
 66  
    /**
 67  
     * Constructor of the <code>Splitter</code> object. This is used
 68  
     * to split the provided string in to individual words so that
 69  
     * they can be assembled as a styled token, which can represent
 70  
     * an XML attribute or element.
 71  
     * 
 72  
     * @param source this is the source that is to be split 
 73  
     */
 74  4180
    public Splitter(String source) {
 75  4180
       this.builder = new StringBuilder();
 76  4180
       this.text = source.toCharArray();
 77  4180
       this.count = text.length;
 78  4180
    }
 79  
    
 80  
    /**
 81  
     * This is used to process the internal string and convert it in
 82  
     * to a styled string. The styled string can then be used as an
 83  
     * XML attribute or element providing a consistent format to the
 84  
     * document that is being generated.
 85  
     * 
 86  
     * @return the string that has been converted to a styled string
 87  
     */
 88  
    public String process() {
 89  9596
       while(off < count) {
 90  5640
          while(off < count) {
 91  5640
             char ch = text[off];
 92  
             
 93  5640
             if(!isSpecial(ch)) {
 94  5416
                break;
 95  
             }
 96  224
             off++;
 97  224
          }
 98  5416
          if(!acronym()) {
 99  5354
             token();
 100  5354
             number();
 101  
          }
 102  
       }
 103  4180
       return builder.toString();
 104  
    }
 105  
    
 106  
    /**
 107  
     * This is used to extract a token from the source string. Once a
 108  
     * token has been extracted the <code>commit</code> method is 
 109  
     * called to add it to the string being build. Each time this is
 110  
     * called a token, if extracted, will be committed to the string.
 111  
     * Before being committed the string is parsed for styling.
 112  
     */
 113  
    private void token() {
 114  5354
       int mark = off;
 115  
       
 116  31523
       while(mark < count) {
 117  27358
          char ch = text[mark];
 118  
          
 119  27358
          if(!isLetter(ch)) {
 120  222
             break;
 121  
          } 
 122  27136
          if(mark > off) {
 123  21791
             if(isUpper(ch)) {
 124  967
                break;
 125  
             }
 126  
          }
 127  26169
          mark++;
 128  26169
       }
 129  5354
       if(mark > off) {
 130  5345
          parse(text, off, mark - off);
 131  5345
          commit(text, off, mark - off);
 132  
       }
 133  5354
       off = mark;
 134  5354
    }
 135  
    
 136  
    /**
 137  
     * This is used to extract a acronym from the source string. Once 
 138  
     * a token has been extracted the <code>commit</code> method is 
 139  
     * called to add it to the string being build. Each time this is
 140  
     * called a token, if extracted, will be committed to the string.
 141  
     * 
 142  
     * @return true if an acronym was extracted from the source
 143  
     */
 144  
    private boolean acronym() { // is it the last one?
 145  5416
       int mark = off;
 146  5416
       int size = 0;
 147  
       
 148  7523
       while(mark < count) {
 149  7437
          char ch = text[mark];
 150  
          
 151  7437
          if(isUpper(ch)) {
 152  2107
             size++;
 153  
          } else {
 154  
             break;
 155  
          }
 156  2107
          mark++;
 157  2107
       }
 158  5416
       if(size > 1) {
 159  62
          if(mark < count) {
 160  53
             char ch = text[mark-1];
 161  
             
 162  53
             if(isUpper(ch)) {
 163  53
                mark--;
 164  
             }
 165  
          }
 166  62
          commit(text, off, mark - off);
 167  62
          off = mark;
 168  
       }
 169  5416
       return size > 1;
 170  
    }
 171  
    
 172  
    /**
 173  
     * This is used to extract a number from the source string. Once 
 174  
     * a token has been extracted the <code>commit</code> method is 
 175  
     * called to add it to the string being build. Each time this is
 176  
     * called a token, if extracted, will be committed to the string.
 177  
     * 
 178  
     * @return true if an number was extracted from the source
 179  
     */
 180  
    private boolean number() {
 181  5354
       int mark = off;
 182  5354
       int size = 0;
 183  
       
 184  5432
       while(mark < count) {
 185  1261
          char ch = text[mark];
 186  
          
 187  1261
          if(isDigit(ch)) {
 188  78
             size++;
 189  
          } else {
 190  
             break;
 191  
          }
 192  78
          mark++;
 193  78
       }
 194  5354
       if(size > 0) {
 195  42
          commit(text, off, mark - off);
 196  
       }
 197  5354
       off = mark;
 198  5354
       return size > 0;
 199  
    }
 200  
 
 201  
    /**
 202  
     * This is used to determine if the provided string evaluates to
 203  
     * a letter character. This delegates to <code>Character</code> 
 204  
     * so that the full range of unicode characters are considered.
 205  
     * 
 206  
     * @param ch this is the character that is to be evaluated
 207  
     * 
 208  
     * @return this returns true if the character is a letter
 209  
     */
 210  
    private boolean isLetter(char ch) {
 211  27358
       return Character.isLetter(ch);
 212  
    }
 213  
    
 214  
    /**
 215  
     * This is used to determine if the provided string evaluates to
 216  
     * a symbol character. This delegates to <code>Character</code> 
 217  
     * so that the full range of unicode characters are considered.
 218  
     * 
 219  
     * @param ch this is the character that is to be evaluated
 220  
     * 
 221  
     * @return this returns true if the character is a symbol
 222  
     */
 223  
    private boolean isSpecial(char ch) {
 224  5640
       return !Character.isLetterOrDigit(ch);
 225  
    }
 226  
    
 227  
    /**
 228  
     * This is used to determine if the provided string evaluates to
 229  
     * a digit character. This delegates to <code>Character</code> 
 230  
     * so that the full range of unicode characters are considered.
 231  
     * 
 232  
     * @param ch this is the character that is to be evaluated
 233  
     * 
 234  
     * @return this returns true if the character is a digit
 235  
     */
 236  
    private boolean isDigit(char ch) {
 237  1261
       return Character.isDigit(ch);
 238  
    }
 239  
    
 240  
    /**
 241  
     * This is used to determine if the provided string evaluates to
 242  
     * an upper case letter. This delegates to <code>Character</code> 
 243  
     * so that the full range of unicode characters are considered.
 244  
     * 
 245  
     * @param ch this is the character that is to be evaluated
 246  
     * 
 247  
     * @return this returns true if the character is upper case
 248  
     */
 249  
    private boolean isUpper(char ch) {
 250  29281
       return Character.isUpperCase(ch);
 251  
    }
 252  
    
 253  
    /**
 254  
     * This is used to convert the provided character to an upper
 255  
     * case character. This delegates to <code>Character</code> to
 256  
     * perform the conversion so unicode characters are considered.
 257  
     * 
 258  
     * @param ch this is the character that is to be converted
 259  
     * 
 260  
     * @return the character converted to upper case
 261  
     */
 262  
    protected char toUpper(char ch) {
 263  2766
       return Character.toUpperCase(ch);
 264  
    }
 265  
    
 266  
    /**
 267  
     * This is used to convert the provided character to a lower
 268  
     * case character. This delegates to <code>Character</code> to
 269  
     * perform the conversion so unicode characters are considered.
 270  
     * 
 271  
     * @param ch this is the character that is to be converted
 272  
     * 
 273  
     * @return the character converted to lower case
 274  
     */
 275  
    protected char toLower(char ch) {
 276  2302
       return Character.toLowerCase(ch);
 277  
    }
 278  
    
 279  
    /**
 280  
     * This is used to parse the provided text in to the style that
 281  
     * is required. Manipulation of the text before committing it
 282  
     * ensures that the text adheres to the required style.
 283  
     * 
 284  
     * @param text this is the text buffer to acquire the token from
 285  
     * @param off this is the offset in the buffer token starts at
 286  
     * @param len this is the length of the token to be parsed
 287  
     */
 288  
    protected abstract void parse(char[] text, int off, int len);
 289  
    
 290  
    /**
 291  
     * This is used to commit the provided text in to the style that
 292  
     * is required. Committing the text to the buffer assembles the
 293  
     * tokens resulting in a complete token.
 294  
     * 
 295  
     * @param text this is the text buffer to acquire the token from
 296  
     * @param off this is the offset in the buffer token starts at
 297  
     * @param len this is the length of the token to be committed
 298  
     */
 299  
    protected abstract void commit(char[] text, int off, int len);
 300  
 }