utf16string/
iters.rs

1//! Implementation for the various char iterators.
2//!
3//! The type itself lives in the lib.rs file to avoid having to have a public alias, but
4//! implementations live here.
5
6use byteorder::ByteOrder;
7
8use std::iter::FusedIterator;
9
10use crate::utf16::{decode_surrogates, is_leading_surrogate, is_trailing_surrogate, Utf16CharExt};
11use crate::{WStrCharIndices, WStrChars};
12
13impl<'a, E> Iterator for WStrChars<'a, E>
14where
15    E: ByteOrder,
16{
17    type Item = char;
18
19    #[inline]
20    fn next(&mut self) -> Option<Self::Item> {
21        // Our input is valid UTF-16, so we can take a lot of shortcuts.
22        let chunk = self.chunks.next()?;
23        let u = E::read_u16(chunk);
24
25        if !is_leading_surrogate(u) {
26            // SAFETY: This is now guaranteed a valid Unicode code point.
27            Some(unsafe { std::char::from_u32_unchecked(u as u32) })
28        } else {
29            let chunk = self.chunks.next().expect("missing trailing surrogate");
30            let u2 = E::read_u16(chunk);
31            debug_assert!(
32                is_trailing_surrogate(u2),
33                "code unit not a trailing surrogate"
34            );
35            Some(unsafe { decode_surrogates(u, u2) })
36        }
37    }
38
39    #[inline]
40    fn count(self) -> usize {
41        // No need to fully construct all characters
42        self.chunks
43            .filter(|bb| !is_trailing_surrogate(E::read_u16(bb)))
44            .count()
45    }
46
47    #[inline]
48    fn last(mut self) -> Option<Self::Item> {
49        self.next_back()
50    }
51}
52
53impl<'a, E> FusedIterator for WStrChars<'a, E> where E: ByteOrder {}
54
55impl<'a, E> DoubleEndedIterator for WStrChars<'a, E>
56where
57    E: ByteOrder,
58{
59    #[inline]
60    fn next_back(&mut self) -> Option<Self::Item> {
61        // Our input is valid UTF-16, so we can take a lot of shortcuts.
62        let chunk = self.chunks.next_back()?;
63        let u = E::read_u16(chunk);
64
65        if !is_trailing_surrogate(u) {
66            // SAFETY: This is now guaranteed a valid Unicode code point.
67            Some(unsafe { std::char::from_u32_unchecked(u as u32) })
68        } else {
69            let chunk = self.chunks.next_back().expect("missing leading surrogate");
70            let u2 = E::read_u16(chunk);
71            debug_assert!(
72                is_leading_surrogate(u2),
73                "code unit not a leading surrogate"
74            );
75            Some(unsafe { decode_surrogates(u2, u) })
76        }
77    }
78}
79
80impl<'a, E> Iterator for WStrCharIndices<'a, E>
81where
82    E: ByteOrder,
83{
84    type Item = (usize, char);
85
86    #[inline]
87    fn next(&mut self) -> Option<Self::Item> {
88        let pos = self.index;
89        let c = self.chars.next()?;
90        self.index += c.encoded_utf16_len();
91        Some((pos, c))
92    }
93
94    #[inline]
95    fn count(self) -> usize {
96        // No need to fully construct all characters
97        self.chars.count()
98    }
99
100    #[inline]
101    fn last(mut self) -> Option<Self::Item> {
102        self.next_back()
103    }
104}
105
106impl<'a, E> DoubleEndedIterator for WStrCharIndices<'a, E>
107where
108    E: ByteOrder,
109{
110    #[inline]
111    fn next_back(&mut self) -> Option<Self::Item> {
112        let c = self.chars.next_back()?;
113        let pos = self.index + self.chars.chunks.len() * std::mem::size_of::<u16>();
114        Some((pos, c))
115    }
116}
117
118impl<'a, E> FusedIterator for WStrCharIndices<'a, E> where E: ByteOrder {}
119
#[cfg(test)]
mod tests {
    use crate::WStr;

    // Fixtures used throughout, all little-endian UTF-16:
    //   b"h\x00e\x00l\x00l\x00o\x00"   is "hello", one code unit per character.
    //   b"\x00\xd8\x00\xdcx\x00"       is U+10000 (surrogate pair D800 DC00) then 'x'.

    /// Forward iteration yields every character in order and pairs up surrogates.
    #[test]
    fn test_wstr_chars() {
        let hello = WStr::from_utf16le(b"h\x00e\x00l\x00l\x00o\x00").unwrap();
        assert_eq!(
            hello.chars().collect::<Vec<char>>(),
            ['h', 'e', 'l', 'l', 'o']
        );

        let paired = WStr::from_utf16le(b"\x00\xd8\x00\xdcx\x00").unwrap();
        assert_eq!(paired.chars().collect::<Vec<char>>(), ['\u{10000}', 'x']);

        // Regression: the code unit 0xF841 is a plain BMP character; leading-surrogate
        // detection used to get this input wrong.
        let bmp = WStr::from_utf16le(b"\x41\xf8A\x00").unwrap();
        assert_eq!(bmp.chars().collect::<Vec<char>>(), ['\u{f841}', 'A']);
    }

    /// Reverse iteration yields the same characters back to front.
    #[test]
    fn test_wstr_chars_reverse() {
        let hello = WStr::from_utf16le(b"h\x00e\x00l\x00l\x00o\x00").unwrap();
        assert_eq!(
            hello.chars().rev().collect::<Vec<char>>(),
            ['o', 'l', 'l', 'e', 'h']
        );

        let paired = WStr::from_utf16le(b"\x00\xd8\x00\xdcx\x00").unwrap();
        assert_eq!(
            paired.chars().rev().collect::<Vec<char>>(),
            ['x', '\u{10000}']
        );
    }

    /// `last()` returns the final character.
    #[test]
    fn test_wstr_chars_last() {
        let hello = WStr::from_utf16le(b"h\x00e\x00l\x00l\x00o\x00").unwrap();
        assert_eq!(hello.chars().last(), Some('o'));

        let paired = WStr::from_utf16le(b"\x00\xd8\x00\xdcx\x00").unwrap();
        assert_eq!(paired.chars().last(), Some('x'));
    }

    /// `count()` counts characters, not code units.
    #[test]
    fn test_wstr_chars_count() {
        let hello = WStr::from_utf16le(b"h\x00e\x00l\x00l\x00o\x00").unwrap();
        assert_eq!(hello.chars().count(), 5);

        let paired = WStr::from_utf16le(b"\x00\xd8\x00\xdcx\x00").unwrap();
        assert_eq!(paired.chars().count(), 2);
    }

    /// Indices are byte offsets; a surrogate pair occupies four bytes.
    #[test]
    fn test_wstr_char_indices() {
        let hello = WStr::from_utf16le(b"h\x00e\x00l\x00l\x00o\x00").unwrap();
        assert_eq!(
            hello.char_indices().collect::<Vec<(usize, char)>>(),
            [(0, 'h'), (2, 'e'), (4, 'l'), (6, 'l'), (8, 'o')]
        );

        let paired = WStr::from_utf16le(b"\x00\xd8\x00\xdcx\x00").unwrap();
        assert_eq!(
            paired.char_indices().collect::<Vec<(usize, char)>>(),
            [(0, '\u{10000}'), (4, 'x')]
        );
    }

    /// Reverse iteration reports the same byte offsets as forward iteration.
    #[test]
    fn test_wstr_char_indices_reverse() {
        let hello = WStr::from_utf16le(b"h\x00e\x00l\x00l\x00o\x00").unwrap();
        assert_eq!(
            hello.char_indices().rev().collect::<Vec<(usize, char)>>(),
            [(8, 'o'), (6, 'l'), (4, 'l'), (2, 'e'), (0, 'h')]
        );

        let paired = WStr::from_utf16le(b"\x00\xd8\x00\xdcx\x00").unwrap();
        assert_eq!(
            paired.char_indices().rev().collect::<Vec<(usize, char)>>(),
            [(4, 'x'), (0, '\u{10000}')]
        );
    }

    /// `last()` returns the final character together with its byte offset.
    #[test]
    fn test_wstr_char_indices_last() {
        let hello = WStr::from_utf16le(b"h\x00e\x00l\x00l\x00o\x00").unwrap();
        assert_eq!(hello.char_indices().last(), Some((8, 'o')));

        let paired = WStr::from_utf16le(b"\x00\xd8\x00\xdcx\x00").unwrap();
        assert_eq!(paired.char_indices().last(), Some((4, 'x')));
    }

    /// `count()` on the index iterator also counts characters, not code units.
    #[test]
    fn test_wstr_char_indices_count() {
        let hello = WStr::from_utf16le(b"h\x00e\x00l\x00l\x00o\x00").unwrap();
        assert_eq!(hello.char_indices().count(), 5);

        let paired = WStr::from_utf16le(b"\x00\xd8\x00\xdcx\x00").unwrap();
        assert_eq!(paired.char_indices().count(), 2);
    }
}