1 package org.sentrysoftware.jawk.jrt; 2 3 /*- 4 * ╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲ 5 * Jawk 6 * ჻჻჻჻჻჻ 7 * Copyright (C) 2006 - 2023 Sentry Software 8 * ჻჻჻჻჻჻ 9 * This program is free software: you can redistribute it and/or modify 10 * it under the terms of the GNU Lesser General Public License as 11 * published by the Free Software Foundation, either version 3 of the 12 * License, or (at your option) any later version. 13 * 14 * This program is distributed in the hope that it will be useful, 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 * GNU General Lesser Public License for more details. 18 * 19 * You should have received a copy of the GNU General Lesser Public 20 * License along with this program. If not, see 21 * <http://www.gnu.org/licenses/lgpl-3.0.html>. 22 * ╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱ 23 */ 24 25 import java.io.FilterReader; 26 import java.io.IOException; 27 import java.io.Reader; 28 import java.util.regex.Matcher; 29 import java.util.regex.Pattern; 30 31 /** 32 * A reader which consumes one record at a time from 33 * an underlying input reader. 34 * 35 * <h2>Greedy Regex Matching</h2> 36 * The current implementation matches setRecordSeparator against 37 * contents of an input buffer (the underlying input 38 * stream filling the input buffer). Records are 39 * split against the matched regular expression 40 * input, treating the regular expression as a 41 * record separator. 42 * 43 * <p> 44 * By default, greedy regular expression matching 45 * for setRecordSeparator is turned off. It is assumed 46 * the user will employ a non-ambiguous regex for setRecordSeparator. 47 * For example, ab*c is a non-ambiguous regex, 48 * but ab?c?b is an ambiguous regex because 49 * it can match ab or abc, and the reader may 50 * accept either one, depending on input buffer boundaries. 51 * The implemented way to employ greedy regex matching 52 * is to consume subsequent input until the match 53 * does not occur at the end of the input buffer, 54 * or no input is available. However, this behavior 55 * is not desirable in all cases (i.e., interactive 56 * input against some sort of ambiguous newline 57 * regex). To enable greedy setRecordSeparator regex consumption, 58 * use <code>-Djawk.forceGreedyRS=true</code>. 59 * 60 * @author Danny Daglas 61 */ 62 public class PartitioningReader extends FilterReader { 63 64 private static final boolean FORCE_GREEDY_RS; 65 66 static { 67 String grs = System.getProperty("jawk.forceGreedyRS", "0").trim(); 68 FORCE_GREEDY_RS = grs.equals("1") || grs.equalsIgnoreCase("yes") || grs.equalsIgnoreCase("true"); 69 } 70 private Pattern rs; 71 private Matcher matcher; 72 private boolean fromFileNameList; 73 74 /** 75 * Construct the partitioning reader. 76 * 77 * @param reader The reader containing the input data stream. 78 * @param recordSeparator The record separator, as a regular expression. 79 */ 80 public PartitioningReader(Reader reader, String recordSeparator) { 81 this(reader, recordSeparator, false); 82 } 83 84 /** 85 * Construct the partitioning reader. 86 * 87 * @param r The reader containing the input data stream. 88 * @param recordSeparator The record separator, as a regular expression. 89 * @param fromFileNameList Whether the underlying input reader 90 * is a file from the filename list (the parameters passed 91 * into AWK after the script argument). 92 */ 93 public PartitioningReader(Reader r, String recordSeparator, boolean fromFileNameList) { 94 super(r); 95 this.fromFileNameList = fromFileNameList; 96 setRecordSeparator(recordSeparator); 97 } 98 private String recordSeparator = null; 99 private boolean consumeAll = false; 100 101 /** 102 * Assign a new record separator for this partitioning reader. 103 * 104 * @param recordSeparator The new record separator, as a regular expression. 105 */ 106 public final void setRecordSeparator(String recordSeparator) { 107 if (!recordSeparator.equals(this.recordSeparator)) { 108 if ("".equals(recordSeparator)) { 109 consumeAll = true; 110 rs = Pattern.compile("\\z", Pattern.DOTALL | Pattern.MULTILINE); 111 } else if ("\n".equals(recordSeparator) || "\r\n".equals(recordSeparator) || "\r".equals(recordSeparator)) { 112 // For performance reason, handle the default RS in a specific way here 113 consumeAll = false; 114 rs = Pattern.compile(recordSeparator, Pattern.LITERAL); 115 } else { 116 consumeAll = false; 117 rs = Pattern.compile(recordSeparator, Pattern.DOTALL | Pattern.MULTILINE); 118 } 119 this.recordSeparator = recordSeparator; 120 } 121 } 122 123 /** 124 * <p>fromFilenameList.</p> 125 * 126 * @return true whether the underlying input reader is from a 127 * filename list argument; false otherwise 128 */ 129 public boolean fromFilenameList() { 130 return fromFileNameList; 131 } 132 133 private StringBuilder remaining = new StringBuilder(); 134 private char[] readBuffer = new char[4096]; 135 136 /** {@inheritDoc} */ 137 @Override 138 public int read(char[] b, int start, int len) throws IOException { 139 int readChars = super.read(b, start, len); 140 if (readChars >= 0) { 141 remaining.append(b, start, readChars); 142 } 143 return readChars; 144 } 145 146 private boolean eof = false; 147 148 /** 149 * Consume one record from the reader. 150 * It uses the record separator regular 151 * expression to mark start/end of records. 152 * 153 * @return the next record, null if no more records exist 154 * @throws java.io.IOException upon an IO error 155 */ 156 public String readRecord() throws IOException { 157 158 if (matcher == null) { 159 matcher = rs.matcher(remaining); 160 } else { 161 matcher.reset(remaining); 162 } 163 164 while (consumeAll || eof || remaining.length() == 0 || !matcher.find()) { 165 int len = read(readBuffer, 0, readBuffer.length); 166 if (eof || (len < 0)) { 167 eof = true; 168 String retVal = remaining.toString(); 169 remaining.setLength(0); 170 if (retVal.length() == 0) { 171 return null; 172 } else { 173 return retVal; 174 } 175 } else if (len == 0) { 176 throw new RuntimeException("len == 0 ?!"); 177 } 178 matcher = rs.matcher(remaining); 179 } 180 181 // if force greedy regex consumption: 182 if (FORCE_GREEDY_RS) { 183 // attempt to move last match away from the end of the input 184 // so that buffer bounderies landing in the middle of 185 // regexp matches that *could* match the regexp if more chars 186 // were read 187 // (one char at a time!) 188 while (matcher.find() && matcher.end() == remaining.length() && matcher.requireEnd()) { 189 if (read(readBuffer, 0, 1) >= 0) { 190 matcher = rs.matcher(remaining); 191 } else { 192 break; 193 } 194 } 195 } 196 197 // we have a record separator! 198 199 String retVal = remaining.substring(0, matcher.start()); 200 remaining.delete(0, matcher.end()); 201 return retVal; 202 } 203 }