View Javadoc
1   package org.sentrysoftware.jawk.jrt;
2   
3   /*-
4    * ╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲
5    * Jawk
6    * ჻჻჻჻჻჻
7    * Copyright (C) 2006 - 2023 Sentry Software
8    * ჻჻჻჻჻჻
9    * This program is free software: you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser General Public License as
11   * published by the Free Software Foundation, either version 3 of the
12   * License, or (at your option) any later version.
13   *
14   * This program is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU General Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU General Lesser Public
20   * License along with this program.  If not, see
21   * <http://www.gnu.org/licenses/lgpl-3.0.html>.
22   * ╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱╲╱
23   */
24  
25  import java.io.FilterReader;
26  import java.io.IOException;
27  import java.io.Reader;
28  import java.util.regex.Matcher;
29  import java.util.regex.Pattern;
30  
/**
 * A reader which consumes one record at a time from
 * an underlying input reader.
 *
 * <h2>Greedy Regex Matching</h2>
 * The current implementation matches the record separator against
 * the contents of an input buffer (the underlying input
 * stream filling the input buffer). Records are
 * split against the matched regular expression
 * input, treating the regular expression as a
 * record separator.
 *
 * <p>
 * By default, greedy regular expression matching
 * for the record separator is turned off. It is assumed
 * the user will employ a non-ambiguous regex for the record separator.
 * For example, <code>ab*c</code> is a non-ambiguous regex,
 * but <code>ab?c?b</code> is an ambiguous regex because
 * it can match <code>ab</code> as well as <code>abb</code>, and the reader may
 * accept either one, depending on input buffer boundaries.
 * The implemented way to employ greedy regex matching
 * is to consume subsequent input (one character at a time) while the
 * match ends at the end of the input buffer and the regex engine
 * reports that more input could change the match,
 * or until no input is available. However, this behavior
 * is not desirable in all cases (e.g., interactive
 * input against some sort of ambiguous newline
 * regex). To enable greedy record separator consumption,
 * use <code>-Djawk.forceGreedyRS=true</code>
 * (<code>1</code> and <code>yes</code> are accepted as well).
 *
 * @author Danny Daglas
 */
public class PartitioningReader extends FilterReader {

	/** Whether greedy record separator matching was requested through the system property. */
	private static final boolean FORCE_GREEDY_RS;

	static {
		String grs = System.getProperty("jawk.forceGreedyRS", "0").trim();
		FORCE_GREEDY_RS = grs.equals("1") || grs.equalsIgnoreCase("yes") || grs.equalsIgnoreCase("true");
	}

	/** Whether the underlying reader comes from the filename list. */
	private final boolean fromFileNameList;
	/** Characters read from the underlying reader and not yet returned as records. */
	private final StringBuilder remaining = new StringBuilder();
	/** Scratch buffer used to pull characters from the underlying reader. */
	private final char[] readBuffer = new char[4096];

	/** Compiled form of the current record separator. */
	private Pattern rs;
	/** Matcher of {@link #rs} over {@link #remaining}; null until needed or after rs changed. */
	private Matcher matcher;
	/** Current record separator, as provided by the caller. */
	private String recordSeparator = null;
	/** True when the record separator is the empty string: the whole input is a single record. */
	private boolean consumeAll = false;
	/** Set once the underlying reader reported end of stream. */
	private boolean eof = false;

	/**
	 * Construct the partitioning reader.
	 *
	 * @param reader The reader containing the input data stream.
	 * @param recordSeparator The record separator, as a regular expression.
	 */
	public PartitioningReader(Reader reader, String recordSeparator) {
		this(reader, recordSeparator, false);
	}

	/**
	 * Construct the partitioning reader.
	 *
	 * @param r The reader containing the input data stream.
	 * @param recordSeparator The record separator, as a regular expression.
	 * @param fromFileNameList Whether the underlying input reader
	 *   is a file from the filename list (the parameters passed
	 *   into AWK after the script argument).
	 */
	public PartitioningReader(Reader r, String recordSeparator, boolean fromFileNameList) {
		super(r);
		this.fromFileNameList = fromFileNameList;
		setRecordSeparator(recordSeparator);
	}

	/**
	 * Assign a new record separator for this partitioning reader.
	 * The new separator applies from the next call to {@link #readRecord()},
	 * including to input that has already been buffered.
	 *
	 * @param recordSeparator The new record separator, as a regular expression.
	 *   The empty string makes the reader return all the input as one record.
	 */
	public final void setRecordSeparator(String recordSeparator) {
		if (recordSeparator.equals(this.recordSeparator)) {
			return;
		}

		if (recordSeparator.isEmpty()) {
			consumeAll = true;
			rs = Pattern.compile("\\z", Pattern.DOTALL | Pattern.MULTILINE);
		} else if ("\n".equals(recordSeparator) || "\r\n".equals(recordSeparator) || "\r".equals(recordSeparator)) {
			// For performance reason, handle the default RS in a specific way here
			consumeAll = false;
			rs = Pattern.compile(recordSeparator, Pattern.LITERAL);
		} else {
			consumeAll = false;
			rs = Pattern.compile(recordSeparator, Pattern.DOTALL | Pattern.MULTILINE);
		}
		this.recordSeparator = recordSeparator;

		// The cached matcher is bound to the previous pattern (Matcher.reset keeps
		// the pattern), so drop it; readRecord() creates a new one from rs.
		matcher = null;
	}

	/**
	 * Tell whether the underlying reader comes from the filename list.
	 *
	 * @return true whether the underlying input reader is from a
	 *	filename list argument; false otherwise
	 */
	public boolean fromFilenameList() {
		return fromFileNameList;
	}

	/**
	 * {@inheritDoc}
	 * <p>
	 * The characters read are also appended to the internal record buffer,
	 * which is how {@link #readRecord()} collects its input.
	 */
	@Override
	public int read(char[] b, int start, int len) throws IOException {
		int readChars = super.read(b, start, len);
		if (readChars >= 0) {
			remaining.append(b, start, readChars);
		}
		return readChars;
	}

	/**
	 * Consume one record from the reader.
	 * It uses the record separator regular
	 * expression to mark start/end of records.
	 *
	 * @return the next record, null if no more records exist
	 * @throws java.io.IOException upon an IO error
	 * @throws RuntimeException if the underlying reader returns 0 characters
	 */
	public String readRecord() throws IOException {

		if (matcher == null) {
			matcher = rs.matcher(remaining);
		} else {
			matcher.reset(remaining);
		}

		while (consumeAll || eof || remaining.length() == 0 || !findSeparator()) {
			int len = read(readBuffer, 0, readBuffer.length);
			if (eof || len < 0) {
				// No separator and no more input: whatever is left is the last record
				eof = true;
				String rest = remaining.toString();
				remaining.setLength(0);
				return rest.isEmpty() ? null : rest;
			}
			if (len == 0) {
				throw new RuntimeException("len == 0 ?!");
			}
			// The buffer grew: search it again from its beginning
			matcher.reset(remaining);
		}

		// we have a record separator!

		String record = remaining.substring(0, matcher.start());
		remaining.delete(0, matcher.end());
		return record;
	}

	/**
	 * Position the matcher on the first record separator found in the buffer.
	 * In greedy mode, a separator that ends exactly at the end of the buffer
	 * and that could still be altered by more input is re-evaluated after
	 * reading further input, one character at a time, so that buffer
	 * boundaries landing in the middle of a separator do not truncate it.
	 *
	 * @return true if the matcher holds a separator match, false if the
	 *   buffer contains no separator
	 * @throws IOException upon an IO error while reading further input
	 */
	private boolean findSeparator() throws IOException {
		if (!matcher.find()) {
			return false;
		}
		if (!FORCE_GREEDY_RS) {
			return true;
		}

		// Inspect the match we already have: calling find() again without
		// resetting the matcher would skip to the following separator.
		while (matcher.end() == remaining.length() && (matcher.hitEnd() || matcher.requireEnd())) {
			if (read(readBuffer, 0, 1) <= 0) {
				// No more input: nothing was appended, the current match stands
				break;
			}
			matcher.reset(remaining);
			if (!matcher.find()) {
				// The extra input invalidated the match (e.g. an end-anchored regex)
				return false;
			}
		}
		return true;
	}
}
203 }