1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
|
From 2482e34ac12910f7e7436a968caf97ab96f8b582 Mon Sep 17 00:00:00 2001
From: Kerin Millar <kfm@plushkava.net>
Date: Sat, 26 Apr 2025 08:43:58 +0100
Subject: [PATCH 1/3] Backport fix for invalid continuation bytes being ignored
as delimiters
This is a partial backport of commit 772e7e760e8a098e4d8dee21cf11090be4757918
from the devel branch. It addresses an issue in read_mbchar() whereby
the read builtin can read past the delimiter character, provided that is
invoked with a multibyte character set in effect. Consider the following
test case.
$ LC_ALL=en_US.UTF-8
$ for i in {194..245}; do printf -v o %o "$i"; printf "\\$o\\n"; done |
while read -r; do declare -p REPLY; done
declare -- REPLY=$'\302\n\303\n\304\n\305\n\306\n\307\n\310\n\311\n\312\
n\313\n\314\n\315\n\316\n\317\n\320\n\321\n\322\n\323\n\324\n\325\n\326\
n\327\n\330\n\331\n\332\n\333\n\334\n\335\n\336\n\337\n\340\n\341\n\342\
n\343\n\344\n\345\n\346\n\347\n\350\n\351\n\352\n\353\n\354\n\355\n\356\
n\357\n\360\n\361\n\362\n\363\n\364\n\365'
The producing loop emits a sequence of bytes in the range 0xC2 - 0xF5.
Since each is terminated by a <newline> character, one would expect for
exactly 52 iterations of the consuming loop, with REPLY being assigned a
single byte each time. Instead, the input is read in its entirety. Why
is that, one may ask.
Given a legal UTF-8 byte sequence, any bytes whose values are between
0xC2 - 0xF4 are combinative in nature; they can only be followed by
between one and three bytes that are outside of that range.
0xC2 - 0xDF : First byte of a 2-byte code unit sequence
0xE0 - 0xEF : First byte of a 3-byte code unit sequence
0xF0 - 0xF4 : First byte of a 4-byte code unit sequence
As such, bash begins by reading the 0xC2 byte, for which mbrtowc(3)
returns -2, indicating an incomplete multibyte sequence. Next, the 0x0A
byte is read, for which mbrtowc(3) returns -1, indicating an invalid
multibyte sequence. At this point, bash ought to recognise the most
recently read byte as a delimiter. Instead, it continues reading the
input stream up until the delimiter that follows 0xF5, which is neither
a combining character nor legal in UTF-8 in any capacity.
This patch addresses the issue by introducing the zungetc() function,
which is used by read_mbchar() to push back the delimiter character that
transforms the sequence from an incomplete one to an invalid one. Said
character is then detected by the next invocation of the zread()
function, allowing for the decision to be made to return.
With this, the output of the test case amounts to 52 lines, as expected.
declare -- REPLY=$'\302'
declare -- REPLY=$'\303'
...
declare -- REPLY=$'\364'
declare -- REPLY=$'\365'
The issue affects all bash releases from 5.0 to 5.3-alpha. As of the
time of writing, it has not been addressed by any of the official
patchlevels, nor has 5.3 been released.
Link: https://pubs.opengroup.org/onlinepubs/9799919799.2024edition/utilities/read.html#tag_20_100_06
Link: https://mywiki.wooledge.org/BashPitfalls#IFS.3D_read_-r_-d_.27.27_filename
Link: https://lists.gnu.org/archive/html/bug-bash/2025-04/msg00068.html
Signed-off-by: Kerin Millar <kfm@plushkava.net>
---
builtins/read.def | 25 ++++++++++++----
externs.h | 1 +
lib/sh/zread.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 94 insertions(+), 6 deletions(-)
diff --git builtins/read.def builtins/read.def
index ddd91d32..53b4bd81 100644
--- builtins/read.def
+++ builtins/read.def
@@ -130,7 +130,7 @@ static void set_readline_timeout PARAMS((sh_timer *t, time_t, long));
#endif
static SHELL_VAR *bind_read_variable PARAMS((char *, char *, int));
#if defined (HANDLE_MULTIBYTE)
-static int read_mbchar PARAMS((int, char *, int, int, int));
+static int read_mbchar PARAMS((int, char *, int, int, int, int));
#endif
static void ttyrestore PARAMS((struct ttsave *));
@@ -806,7 +806,7 @@ add_char:
else
# endif
if (locale_utf8locale == 0 || ((c & 0x80) != 0))
- i += read_mbchar (fd, input_string, i, c, unbuffered_read);
+ i += read_mbchar (fd, input_string, i, c, delim, unbuffered_read);
}
#endif
@@ -1064,10 +1064,10 @@ bind_read_variable (name, value, flags)
#if defined (HANDLE_MULTIBYTE)
static int
-read_mbchar (fd, string, ind, ch, unbuffered)
+read_mbchar (fd, string, ind, ch, delim, unbuffered)
int fd;
char *string;
- int ind, ch, unbuffered;
+ int ind, ch, delim, unbuffered;
{
char mbchar[MB_LEN_MAX + 1];
int i, n, r;
@@ -1101,8 +1101,21 @@ read_mbchar (fd, string, ind, ch, unbuffered)
mbchar[i++] = c;
continue;
}
- else if (ret == (size_t)-1 || ret == (size_t)0 || ret > (size_t)0)
- break;
+ else if (ret == (size_t)-1)
+ {
+ /* If we read a delimiter character that makes this an invalid
+ multibyte character, we can't just add it to the input string
+ and treat it as a byte. We need to push it back so a subsequent
+ zread will pick it up. */
+ if (c == delim)
+ {
+ zungetc (c);
+ mbchar[--i] = '\0'; /* unget the delimiter */
+ }
+ break; /* invalid multibyte character */
+ }
+ else if (ret == (size_t)0 || ret > (size_t)0)
+ break; /* valid multibyte character */
}
mbchar_return:
diff --git externs.h externs.h
index 931dba9c..1b70a13b 100644
--- externs.h
+++ externs.h
@@ -536,6 +536,7 @@ extern ssize_t zreadintr PARAMS((int, char *, size_t));
extern ssize_t zreadc PARAMS((int, char *));
extern ssize_t zreadcintr PARAMS((int, char *));
extern ssize_t zreadn PARAMS((int, char *, size_t));
+extern int zungetc PARAMS((int));
extern void zreset PARAMS((void));
extern void zsyncfd PARAMS((int));
diff --git lib/sh/zread.c lib/sh/zread.c
index dafb7f60..7cfbb288 100644
--- lib/sh/zread.c
+++ lib/sh/zread.c
@@ -41,6 +41,10 @@ extern int errno;
# define ZBUFSIZ 4096
#endif
+#ifndef EOF
+# define EOF -1
+#endif
+
extern int executing_builtin;
extern void check_signals_and_traps (void);
@@ -48,6 +52,11 @@ extern void check_signals (void);
extern int signal_is_trapped (int);
extern int read_builtin_timeout (int);
+int zungetc (int);
+
+/* Provide one character of pushback whether we are using read or zread. */
+static int zpushedchar = -1;
+
/* Read LEN bytes from FD into BUF. Retry the read on EINTR. Any other
error causes the loop to break. */
ssize_t
@@ -59,6 +68,15 @@ zread (fd, buf, len)
ssize_t r;
check_signals (); /* check for signals before a blocking read */
+
+ /* If we pushed a char back, return it immediately */
+ if (zpushedchar != -1)
+ {
+ *buf = (unsigned char)zpushedchar;
+ zpushedchar = -1;
+ return 1;
+ }
+
/* should generalize into a mechanism where different parts of the shell can
`register' timeouts and have them checked here. */
while (((r = read_builtin_timeout (fd)) < 0 || (r = read (fd, buf, len)) < 0) &&
@@ -95,6 +113,14 @@ zreadretry (fd, buf, len)
ssize_t r;
int nintr;
+ /* If we pushed a char back, return it immediately */
+ if (zpushedchar != -1)
+ {
+ *buf = (unsigned char)zpushedchar;
+ zpushedchar = -1;
+ return 1;
+ }
+
for (nintr = 0; ; )
{
r = read (fd, buf, len);
@@ -118,6 +144,15 @@ zreadintr (fd, buf, len)
size_t len;
{
check_signals ();
+
+ /* If we pushed a char back, return it immediately */
+ if (zpushedchar != -1)
+ {
+ *buf = (unsigned char)zpushedchar;
+ zpushedchar = -1;
+ return 1;
+ }
+
return (read (fd, buf, len));
}
@@ -135,6 +170,14 @@ zreadc (fd, cp)
{
ssize_t nr;
+ /* If we pushed a char back, return it immediately */
+ if (zpushedchar != -1 && cp)
+ {
+ *cp = (unsigned char)zpushedchar;
+ zpushedchar = -1;
+ return 1;
+ }
+
if (lind == lused || lused == 0)
{
nr = zread (fd, lbuf, sizeof (lbuf));
@@ -160,6 +203,14 @@ zreadcintr (fd, cp)
{
ssize_t nr;
+ /* If we pushed a char back, return it immediately */
+ if (zpushedchar != -1 && cp)
+ {
+ *cp = (unsigned char)zpushedchar;
+ zpushedchar = -1;
+ return 1;
+ }
+
if (lind == lused || lused == 0)
{
nr = zreadintr (fd, lbuf, sizeof (lbuf));
@@ -186,6 +237,13 @@ zreadn (fd, cp, len)
{
ssize_t nr;
+ if (zpushedchar != -1 && cp)
+ {
+ *cp = zpushedchar;
+ zpushedchar = -1;
+ return 1;
+ }
+
if (lind == lused || lused == 0)
{
if (len > sizeof (lbuf))
@@ -204,6 +262,22 @@ zreadn (fd, cp, len)
return 1;
}
+int
+zungetc (c)
+ int c;
+{
+ if (zpushedchar == -1)
+ {
+ zpushedchar = c;
+ return c;
+ }
+
+ if (c == EOF || lind == 0)
+ return (EOF);
+ lbuf[--lind] = c; /* XXX */
+ return c;
+}
+
void
zreset ()
{
--
2.49.0
|