1
use core::convert::Infallible;
2
use core::marker::PhantomData;
3

            
4
use crate::anystr::AnyStr;
5
use crate::string::{
6
    merge_surrogate_pair, StringContents, HEX_OFFSET_TABLE, HIGH_SURROGATE_MAX, HIGH_SURROGATE_MIN,
7
    LOW_SURROGATE_MAX, LOW_SURROGATE_MIN, SAFE_STRING_BYTES, SAFE_STRING_BYTES_VERIFY_UTF8,
8
};
9
use crate::{Error, ErrorKind, JsonNumber, JsonString, JsonStringInfo};
10

            
11
/// A JSON Token.
12
10502
#[derive(Debug, Clone, PartialEq, Eq)]
13
pub enum Token<'a> {
14
    /// The `null` keyword.
15
    Null,
16
    /// A boolean literal.
17
    Bool(bool),
18
    /// A string literal.
19
    String(JsonString<'a>),
20
    /// A numeric literal.
21
    Number(JsonNumber<'a>),
22
    /// The beginning of an object (`{`).
23
    Object,
24
    /// The end of an object (`}`).
25
    ObjectEnd,
26
    /// The beginning of an array (`[`).
27
    Array,
28
    /// The end of an array (`]`).
29
    ArrayEnd,
30
    /// A colon (`:`), delimiting a key-value pair in an object.
31
    Colon,
32
    /// A comma (`,`), delimiting a list of values or key-value pairs.
33
    Comma,
34
}
35

            
36
/// The (likely) kind of the next JSON token.
///
/// The kind is guessed from the next character in the token stream only, so
/// it describes what the token will be *if* the input is valid JSON.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
pub enum PeekableTokenKind {
    /// `null` keyword.
    ///
    /// Corresponds to [`Token::Null`].
    Null,
    /// `true` keyword.
    ///
    /// Corresponds to [`Token::Bool`].
    True,
    /// `false` keyword.
    ///
    /// Corresponds to [`Token::Bool`].
    False,
    /// A string literal.
    ///
    /// Corresponds to [`Token::String`].
    String,
    /// A numeric literal.
    ///
    /// Corresponds to [`Token::Number`].
    Number,
    /// The beginning of an object (`{`).
    ///
    /// Corresponds to [`Token::Object`].
    Object,
    /// The end of an object (`}`).
    ///
    /// Corresponds to [`Token::ObjectEnd`].
    ObjectEnd,
    /// The beginning of an array (`[`).
    ///
    /// Corresponds to [`Token::Array`].
    Array,
    /// The end of an array (`]`).
    ///
    /// Corresponds to [`Token::ArrayEnd`].
    ArrayEnd,
    /// Unable to determine the specific token.
    ///
    /// This kind suggests that the next token is likely malformed, but does not guarantee it.
    Unrecognized,
    /// A colon (`:`), delimiting a key-value pair in an object.
    ///
    /// Corresponds to [`Token::Colon`].
    Colon,
    /// A comma (`,`), delimiting a list of values or key-value pairs.
    ///
    /// Corresponds to [`Token::Comma`].
    Comma,
}
90

            
91
/// A JSON tokenizer, which converts JSON source to a series of [`Token`]s.
92
pub struct Tokenizer<'a, const GUARANTEED_UTF8: bool> {
93
    source: ByteIterator<'a>,
94
}
95

            
96
impl<'a> Tokenizer<'a, true> {
97
    /// Returns a new tokenizer for the JSON `source` provided.
98
    ///
99
    /// Because the `str` type guarantees that `json` is valid UTF-8, no
100
    /// additional unicode checks are performed on unescaped unicode sequences.
101
    #[must_use]
102
3
    pub fn for_json(source: &'a str) -> Self {
103
3
        Self::new(source.as_bytes())
104
3
    }
105
}
106

            
107
impl<'a> Tokenizer<'a, false> {
108
    /// Returns a new tokenizer for the JSON `source` bytes provided.
109
    ///
110
    /// This function verifies that `json` is valid UTF-8 while parsing the
111
    /// JSON.
112
    #[must_use]
113
1
    pub fn for_json_bytes(source: &'a [u8]) -> Self {
114
1
        Self::new(source)
115
1
    }
116
}
117

            
118
impl<'a, const GUARANTEED_UTF8: bool> Tokenizer<'a, GUARANTEED_UTF8> {
119
    #[must_use]
120
    #[inline]
121
3849
    fn new(source: &'a [u8]) -> Self {
122
3849
        Self {
123
3849
            source: ByteIterator::new(source),
124
3849
        }
125
3849
    }
126

            
127
    /// Returns the next token, or an [`ErrorKind::UnexpectedEof`].
128
    ///
129
    /// This is functionally identical to this type's [`Iterator`]
130
    /// implementation, except that this function returns an error instead of
131
    /// None when no more tokens remain.
132
    #[inline]
133
25993
    pub fn next_or_eof(&mut self) -> Result<Token<'a>, Error> {
134
25993
        let (offset, byte) = self.source.read_non_ws().map_err(Error::into_fallable)?;
135
25808
        self.read_peek(offset, byte)
136
25993
    }
137

            
138
    #[inline]
139
25823
    fn read_peek(&mut self, offset: usize, first: &'a u8) -> Result<Token<'a>, Error> {
140
25823
        match first {
141
1548
            b'{' => Ok(Token::Object),
142
664
            b'}' => Ok(Token::ObjectEnd),
143
6103
            b'[' => Ok(Token::Array),
144
1781
            b']' => Ok(Token::ArrayEnd),
145
3419
            b',' => Ok(Token::Comma),
146
2885
            b':' => Ok(Token::Colon),
147
5445
            b'"' => self.read_string_from_source(offset).map(Token::String),
148
299
            b'-' => self
149
299
                .read_number_from_source(InitialNumberState::Sign, offset)
150
299
                .map(Token::Number),
151
492
            b'0' => self
152
492
                .read_number_from_source(InitialNumberState::Zero, offset)
153
492
                .map(Token::Number),
154
2160
            b'1'..=b'9' => self
155
1794
                .read_number_from_source(InitialNumberState::Digit, offset)
156
1794
                .map(Token::Number),
157
386
            b't' => self
158
386
                .read_literal_from_source(b"rue")
159
386
                .map(|()| Token::Bool(true)),
160
189
            b'f' => self
161
189
                .read_literal_from_source(b"alse")
162
189
                .map(|()| Token::Bool(false)),
163
259
            b'n' => self.read_literal_from_source(b"ull").map(|()| Token::Null),
164
559
            _ => Err(Error {
165
559
                offset,
166
559
                kind: ErrorKind::Unexpected(*first),
167
559
            }),
168
        }
169
25823
    }
170

            
171
    #[inline]
172
    #[allow(unsafe_code)]
173
5445
    fn read_string_from_source(&mut self, start: usize) -> Result<JsonString<'a>, Error> {
174
5445
        let safe_strings = if GUARANTEED_UTF8 {
175
2001
            SAFE_STRING_BYTES
176
        } else {
177
3444
            SAFE_STRING_BYTES_VERIFY_UTF8
178
        };
179
5445
        let mut string_info = JsonStringInfo::NONE;
180

            
181
        loop {
182
53504
            let (offset, byte) = self.source.next().ok_or(Error {
183
53504
                offset: self.source.offset,
184
53504
                kind: ErrorKind::UnclosedString,
185
53504
            })?;
186
53461
            if safe_strings[usize::from(*byte)] {
187
46397
                string_info.add_bytes(1);
188
46397
            } else {
189
7064
                match byte {
190
                    b'"' => {
191
4922
                        break Ok(JsonString {
192
4922
                            source: StringContents::Json(
193
4922
                                // SAFETY: the UTF8 has been manually verified by the parser.
194
4922
                                AnyStr::Borrowed(unsafe {
195
4922
                                    core::str::from_utf8_unchecked(
196
4922
                                        &self.source.bytes[start + 1..offset],
197
4922
                                    )
198
4922
                                }),
199
4922
                            ),
200
4922
                            info: string_info,
201
4922
                        });
202
                    }
203
1930
                    b'\\' => self.read_string_escape(&mut string_info)?,
204
115
                    128..=255 => {
205
                        // Manual UTF-8 validation
206
115
                        let utf8_start = offset;
207
410
                        while let Some(byte) = self.source.peek() {
208
410
                            if byte < &128 {
209
115
                                break;
210
295
                            }
211
295

            
212
295
                            self.source.next();
213
                        }
214

            
215
115
                        let unicode_end = self.source.offset;
216
115
                        string_info.add_bytes(unicode_end - utf8_start);
217
115
                        if core::str::from_utf8(&self.source.bytes[utf8_start..unicode_end])
218
115
                            .is_err()
219
                        {
220
                            // The offset on this is incorrect.
221
67
                            return Err(Error {
222
67
                                offset,
223
67
                                kind: ErrorKind::Utf8,
224
67
                            });
225
48
                        }
226
                    }
227
97
                    0..=31 => {
228
97
                        return Err(Error {
229
97
                            offset,
230
97
                            kind: ErrorKind::Unexpected(*byte),
231
97
                        })
232
                    }
233
                    b' '..=127 => {
234
                        unreachable!("safe_strings filters these values")
235
                    }
236
                }
237
            }
238
        }
239
5445
    }
240

            
241
    #[allow(unsafe_code)]
242
    #[inline]
243
    fn read_string_escape(&mut self, string_info: &mut JsonStringInfo) -> Result<(), Error> {
244
1930
        match self.source.read()? {
245
842
            (_, b'"' | b'\\' | b'/' | b'b' | b'f' | b'r' | b'n' | b't') => {
246
842
                string_info.add_bytes_from_escape(1);
247
842
            }
248
889
            (offset, b'u') => {
249
889
                // 4 hexadecimal digits.
250
889
                let mut decoded = 0_u32;
251
4387
                for _ in 0..4 {
252
3517
                    let (offset, digit) = self.source.read()?;
253
3517
                    let nibble = HEX_OFFSET_TABLE[usize::from(*digit)];
254
3517
                    if nibble == u8::MAX {
255
19
                        return Err(Error {
256
19
                            offset,
257
19
                            kind: ErrorKind::InvalidHexadecimal,
258
19
                        });
259
3498
                    }
260
3498
                    decoded = decoded << 4 | u32::from(nibble);
261
                }
262

            
263
870
                let ch = if let Some(ch) = char::from_u32(decoded) {
264
724
                    ch
265
                } else {
266
                    // We either have an invalid codepoint or a partial
267
                    // surrogate.
268
146
                    let mut decoded_is_surrogate_pair = false;
269
146
                    if (HIGH_SURROGATE_MIN..=HIGH_SURROGATE_MAX).contains(&decoded) {
270
                        // We have a potential surrogate pair. Try to read another \u escape code
271
120
                        if self.source.read()?.1 == &b'\\' && self.source.read()?.1 == &b'u' {
272
90
                            let mut second_codepoint = 0;
273
378
                            for _ in 0..4 {
274
312
                                let (offset, digit) = self.source.read()?;
275
312
                                let nibble = HEX_OFFSET_TABLE[usize::from(*digit)];
276
312
                                if nibble == u8::MAX {
277
24
                                    return Err(Error {
278
24
                                        offset,
279
24
                                        kind: ErrorKind::InvalidHexadecimal,
280
24
                                    });
281
288
                                }
282
288
                                second_codepoint = second_codepoint << 4 | u32::from(nibble);
283
                            }
284
66
                            if (LOW_SURROGATE_MIN..=LOW_SURROGATE_MAX).contains(&second_codepoint) {
285
48
                                // We have a valid surrogate pair
286
48
                                decoded = merge_surrogate_pair(decoded, second_codepoint);
287
48
                                decoded_is_surrogate_pair = true;
288
48
                            }
289
30
                        }
290
26
                    }
291

            
292
122
                    if decoded_is_surrogate_pair {
293
                        // SAFETY: we have manually marged the surrogate pair
294
                        // into a single valid codepoint, and this cannot fail.
295
48
                        unsafe { char::from_u32_unchecked(decoded) }
296
                    } else {
297
74
                        return Err(Error {
298
74
                            offset,
299
74
                            kind: ErrorKind::Utf8,
300
74
                        });
301
                    }
302
                };
303

            
304
772
                string_info.add_bytes_from_escape(ch.len_utf8());
305
            }
306
193
            (offset, _) => {
307
193
                return Err(Error {
308
193
                    offset,
309
193
                    kind: ErrorKind::InvalidEscape,
310
193
                })
311
            }
312
        }
313
1614
        Ok(())
314
1930
    }
315

            
316
    #[allow(unsafe_code)]
317
    #[inline]
318
2585
    fn read_number_from_source(
319
2585
        &mut self,
320
2585
        initial_state: InitialNumberState,
321
2585
        start: usize,
322
2585
    ) -> Result<JsonNumber<'a>, Error> {
323
2585
        // Numbers are the "hardest" in that we have to peek the digits since
324
2585
        // there is no terminal character. Every other type in JSON has a way to
325
2585
        // know when the type ends.
326
2585

            
327
2585
        // First, if the number began with a sign, we must read an integer
328
2585
        // digit. The JSON spec disallows numbers with leading 0s. If the first
329
2585
        // digit is a 0, it must be a decimal with a 0 integer value.
330
2585
        if initial_state != InitialNumberState::Zero {
331
2093
            let mut read_integer_digit = initial_state == InitialNumberState::Digit;
332
5366
            while let Some(byte) = self.source.peek() {
333
354
                match byte {
334
48
                    b'0' if !read_integer_digit => {
335
48
                        // 0 after a sign still counts as the complete Integer
336
48
                        // part.
337
48
                        self.source.read()?;
338
48
                        read_integer_digit = true;
339
48
                        break;
340
                    }
341
3966
                    b'0'..=b'9' => {
342
3273
                        self.source.read()?;
343
3273
                        read_integer_digit = true;
344
                    }
345
1975
                    _ => break,
346
                }
347
            }
348

            
349
2093
            if !read_integer_digit {
350
38
                return Err(Error {
351
38
                    offset: self.source.offset,
352
38
                    kind: ErrorKind::ExpectedDigit,
353
38
                });
354
2055
            }
355
492
        }
356

            
357
        // If the next character is a period, this is a floating point literal.
358
2547
        if let Some(b'.') = self.source.peek() {
359
552
            self.source.next();
360
552

            
361
552
            // Read one or more decimal digits
362
552
            let mut read_decimal_digit = false;
363
3070
            while let Some(byte) = self.source.peek() {
364
3061
                if byte.is_ascii_digit() {
365
2518
                    self.source.next();
366
2518
                    read_decimal_digit = true;
367
2518
                } else {
368
543
                    break;
369
                }
370
            }
371

            
372
552
            if !read_decimal_digit {
373
44
                return Err(Error {
374
44
                    offset: self.source.offset,
375
44
                    kind: ErrorKind::ExpectedDecimalDigit,
376
44
                });
377
508
            }
378
1995
        }
379

            
380
        // Next, we might have an exponent
381
2503
        if let Some(b'e' | b'E') = self.source.peek() {
382
698
            self.source.next();
383
698

            
384
698
            // Next, we might have a sign
385
698
            if let Some(b'-' | b'+') = self.source.peek() {
386
394
                self.source.next();
387
394
            }
388

            
389
            // Read one or more exponent digits
390
698
            let mut read_exponent_digit = false;
391
2502
            while let Some(byte) = self.source.peek() {
392
2496
                if byte.is_ascii_digit() {
393
1804
                    self.source.next();
394
1804
                    read_exponent_digit = true;
395
1804
                } else {
396
692
                    break;
397
                }
398
            }
399

            
400
698
            if !read_exponent_digit {
401
190
                return Err(Error {
402
190
                    offset: self.source.offset,
403
190
                    kind: ErrorKind::ExpectedExponent,
404
190
                });
405
508
            }
406
1805
        }
407

            
408
2313
        Ok(JsonNumber {
409
2313
            // SAFETY: To reach this point, we can only have read ascii
410
2313
            // characters.
411
2313
            source: AnyStr::Borrowed(unsafe {
412
2313
                core::str::from_utf8_unchecked(&self.source.bytes[start..self.source.offset])
413
2313
            }),
414
2313
        })
415
2585
    }
416

            
417
    #[inline]
418
834
    fn read_literal_from_source(&mut self, remaining_bytes: &[u8]) -> Result<(), Error> {
419
3421
        for expected in remaining_bytes {
420
2673
            let (offset, received) = self.source.next().ok_or(Error {
421
2673
                offset: self.source.offset,
422
2673
                kind: ErrorKind::UnexpectedEof,
423
2673
            })?;
424

            
425
2654
            if received != expected {
426
67
                return Err(Error {
427
67
                    offset,
428
67
                    kind: ErrorKind::Unexpected(*received),
429
67
                });
430
2587
            }
431
        }
432

            
433
748
        Ok(())
434
834
    }
435

            
436
    /// Peeks at the next non-whitespace character, and returns a what kind of
437
    /// token the next token will be if the input is valid JSON syntax.
438
    ///
439
    /// If the source input has no remaining non-whitespace characters, this
440
    /// function returns `None`.
441
    #[inline]
442
7562
    pub fn peek(&mut self) -> Option<PeekableTokenKind> {
443
7562
        self.source.skip_ws();
444

            
445
7562
        let token = self.source.peek()?;
446

            
447
7541
        let kind = match token {
448
521
            b'{' => PeekableTokenKind::Object,
449
1
            b'}' => PeekableTokenKind::ObjectEnd,
450
2918
            b'[' => PeekableTokenKind::Array,
451
182
            b']' => PeekableTokenKind::ArrayEnd,
452
118
            b',' => PeekableTokenKind::Comma,
453
1
            b':' => PeekableTokenKind::Colon,
454
1265
            b'"' => PeekableTokenKind::String,
455
1984
            b'-' | b'0'..=b'9' => PeekableTokenKind::Number,
456
125
            b't' => PeekableTokenKind::True,
457
89
            b'f' => PeekableTokenKind::False,
458
102
            b'n' => PeekableTokenKind::Null,
459
235
            _ => PeekableTokenKind::Unrecognized,
460
        };
461

            
462
7541
        Some(kind)
463
7562
    }
464

            
465
    /// Returns the current byte offset of the tokenizer.
466
    #[inline]
467
    #[must_use]
468
43909
    pub const fn offset(&self) -> usize {
469
43909
        self.source.offset
470
43909
    }
471
}
472

            
473
/// Parses JSON, driven by a [`ParseDelegate`].
474
///
475
/// This is a low-level type. In general, users will want to use either the
476
/// [`Document`](crate::doc::Document) or [`Value`](crate::Value) types instead of
477
/// directly interacting with the parser.
478
///
479
/// This type uses a constant to track whether UTF-8 needs to be manually
480
/// verified or not. This allows the compiler to optimize the `&[u8]` and `&str`
481
/// parsing methods independently.
482
pub struct Parser<'a, const GUARANTEED_UTF8: bool> {
483
    tokenizer: Tokenizer<'a, GUARANTEED_UTF8>,
484
}
485

            
486
impl<'a> Parser<'a, false> {
487
    /// Parses a JSON payload, invoking functions on `delegate` as the payload
488
    /// is parsed.
489
    ///
490
    /// This function verifies that `json` is valid UTF-8 while parsing the
491
    /// JSON.
492
75
    pub fn parse_json_bytes<D>(value: &'a [u8], delegate: D) -> Result<D::Value, Error<D::Error>>
493
75
    where
494
75
        D: ParseDelegate<'a>,
495
75
    {
496
75
        Self::parse_json_bytes_with_config(value, ParseConfig::default(), delegate)
497
75
    }
498

            
499
    /// Parses a JSON payload, invoking functions on `delegate` as the payload
500
    /// is parsed. The parser honors the settings from `config`.
501
    ///
502
    /// This function verifies that `json` is valid UTF-8 while parsing the
503
    /// JSON.
504
537
    pub fn parse_json_bytes_with_config<D>(
505
537
        value: &'a [u8],
506
537
        config: ParseConfig,
507
537
        delegate: D,
508
537
    ) -> Result<D::Value, Error<D::Error>>
509
537
    where
510
537
        D: ParseDelegate<'a>,
511
537
    {
512
537
        Self::parse_bytes(value, config, delegate)
513
537
    }
514

            
515
    /// Validates that `json` contains valid JSON and returns the kind of data
516
    /// the payload contained.
517
1909
    pub fn validate_json_bytes(json: &'a [u8]) -> Result<JsonKind, Error> {
518
1909
        Self::validate_json_bytes_with_config(json, ParseConfig::default())
519
1909
    }
520

            
521
    /// Validates that `json` contains valid JSON, using the settings from
522
    /// `config`, and returns the kind of data the payload contained.
523
2125
    pub fn validate_json_bytes_with_config(
524
2125
        json: &'a [u8],
525
2125
        config: ParseConfig,
526
2125
    ) -> Result<JsonKind, Error> {
527
2125
        Self::parse_bytes(json, config, ())
528
2125
    }
529
}
530

            
531
impl<'a, const GUARANTEED_UTF8: bool> Iterator for Tokenizer<'a, GUARANTEED_UTF8> {
532
    type Item = Result<Token<'a>, Error>;
533

            
534
    #[inline]
535
18
    fn next(&mut self) -> Option<Self::Item> {
536
18
        let (offset, byte) = self.source.next_non_ws()?;
537
15
        Some(self.read_peek(offset, byte))
538
18
    }
539
}
540

            
541
impl<'a> Parser<'a, true> {
542
    /// Parses a JSON payload, invoking functions on `delegate` as the payload
543
    /// is parsed.
544
    ///
545
    /// Because the `str` type guarantees that `json` is valid UTF-8, no
546
    /// additional unicode checks are performed on unescaped unicode sequences.
547
116
    pub fn parse_json<D>(value: &'a str, delegate: D) -> Result<D::Value, Error<D::Error>>
548
116
    where
549
116
        D: ParseDelegate<'a>,
550
116
    {
551
116
        Self::parse_json_with_config(value, ParseConfig::default(), delegate)
552
116
    }
553

            
554
    /// Parses a JSON payload, invoking functions on `delegate` as the payload
555
    /// is parsed. The parser honors the settings from `config`.
556
    ///
557
    /// Because the `str` type guarantees that `json` is valid UTF-8, no
558
    /// additional unicode checks are performed on unescaped unicode sequences.
559
658
    pub fn parse_json_with_config<D>(
560
658
        value: &'a str,
561
658
        config: ParseConfig,
562
658
        delegate: D,
563
658
    ) -> Result<D::Value, Error<D::Error>>
564
658
    where
565
658
        D: ParseDelegate<'a>,
566
658
    {
567
658
        Self::parse_bytes(value.as_bytes(), config, delegate)
568
658
    }
569

            
570
    /// Validates that `json` contains valid JSON and returns the kind of data
571
    /// the payload contained.
572
1
    pub fn validate_json(json: &'a str) -> Result<JsonKind, Error> {
573
1
        Self::validate_json_with_config(json, ParseConfig::default())
574
1
    }
575

            
576
    /// Validates that `json` contains valid JSON, using the settings from
577
    /// `config`, and returns the kind of data the payload contained.
578
217
    pub fn validate_json_with_config(
579
217
        json: &'a str,
580
217
        config: ParseConfig,
581
217
    ) -> Result<JsonKind, Error> {
582
217
        Self::parse_json_with_config(json, config, ())
583
217
    }
584
}
585

            
586
impl<'a, const GUARANTEED_UTF8: bool> Parser<'a, GUARANTEED_UTF8> {
587
3320
    fn parse_bytes<D>(
588
3320
        source: &'a [u8],
589
3320
        config: ParseConfig,
590
3320
        delegate: D,
591
3320
    ) -> Result<D::Value, Error<D::Error>>
592
3320
    where
593
3320
        D: ParseDelegate<'a>,
594
3320
    {
595
3320
        let mut state = ParseState::new(config, delegate);
596
3320
        let mut parser = Self {
597
3320
            tokenizer: Tokenizer::new(source),
598
3320
        };
599
3320
        let value = parser.read_from_source(&mut state)?;
600

            
601
1330
        if !state.config.allow_all_types_at_root
602
34
            && !matches!(
603
191
                state.delegate.kind_of(&value),
604
                JsonKind::Object | JsonKind::Array
605
            )
606
        {
607
34
            return Err(Error {
608
34
                offset: 0,
609
34
                kind: ErrorKind::PayloadsShouldBeObjectOrArray,
610
34
            });
611
1296
        }
612
1296

            
613
1296
        match parser.tokenizer.source.next_non_ws() {
614
1115
            None => Ok(value),
615
181
            Some((offset, _)) => Err(Error {
616
181
                offset,
617
181
                kind: ErrorKind::TrailingNonWhitespace,
618
181
            }),
619
        }
620
3320
    }
621

            
622
    #[inline]
623
3320
    fn read_from_source<D>(
624
3320
        &mut self,
625
3320
        state: &mut ParseState<'a, D>,
626
3320
    ) -> Result<D::Value, Error<D::Error>>
627
3320
    where
628
3320
        D: ParseDelegate<'a>,
629
3320
    {
630
3320
        self.read_tokens(state)
631
3320
    }
632

            
633
    #[inline]
634
11688
    fn read_tokens<D>(&mut self, state: &mut ParseState<'a, D>) -> Result<D::Value, Error<D::Error>>
635
11688
    where
636
11688
        D: ParseDelegate<'a>,
637
11688
    {
638
11688
        let offset = self.tokenizer.offset();
639
11688
        let token = self.tokenizer.next_or_eof().map_err(Error::into_fallable)?;
640

            
641
10567
        let unexpected = |value: u8| {
642
135
            Err(Error {
643
135
                offset,
644
135
                kind: ErrorKind::Unexpected(value),
645
135
            })
646
135
        };
647

            
648
10567
        let into_error = |error: D::Error| Error {
649
16
            offset,
650
16
            kind: ErrorKind::ErrorFromDelegate(error),
651
16
        };
652

            
653
10567
        match token {
654
154
            Token::Null => state.delegate.null().map_err(into_error),
655
            Token::Object => {
656
1317
                state.begin_nest().map_err(|kind| Error { offset, kind })?;
657

            
658
1317
                self.read_object(state).map_err(|mut error| {
659
873
                    if matches!(error.kind, ErrorKind::UnexpectedEof) {
660
63
                        error.kind = ErrorKind::UnclosedObject;
661
810
                    }
662

            
663
873
                    error
664
1317
                })
665
            }
666
            Token::Array => {
667
5289
                state.begin_nest().map_err(|kind| Error { offset, kind })?;
668

            
669
5245
                self.read_array(state).map_err(|mut error| {
670
3757
                    if matches!(error.kind, ErrorKind::UnexpectedEof) {
671
106
                        error.kind = ErrorKind::UnclosedArray;
672
3651
                    }
673

            
674
3757
                    error
675
5245
                })
676
            }
677
1
            Token::ObjectEnd => unexpected(b'}'),
678
6
            Token::ArrayEnd => unexpected(b']'),
679
32
            Token::Colon => unexpected(b':'),
680
96
            Token::Comma => unexpected(b','),
681
371
            Token::Bool(value) => state.delegate.boolean(value).map_err(into_error),
682
1593
            Token::String(value) => state.delegate.string(value).map_err(into_error),
683
1708
            Token::Number(value) => state.delegate.number(value).map_err(into_error),
684
        }
685
11688
    }
686

            
687
    #[inline]
688
1317
    fn read_object<D>(&mut self, state: &mut ParseState<'a, D>) -> Result<D::Value, Error<D::Error>>
689
1317
    where
690
1317
        D: ParseDelegate<'a>,
691
1317
    {
692
1317
        let offset = self.tokenizer.offset();
693
1317
        let mut object = state.delegate.begin_object().map_err(|kind| Error {
694
3
            offset,
695
3
            kind: ErrorKind::ErrorFromDelegate(kind),
696
1317
        })?;
697

            
698
2549
        loop {
699
2549
            let offset = self.tokenizer.offset();
700
2549
            let token = self.tokenizer.next_or_eof().map_err(Error::into_fallable)?;
701

            
702
2430
            let key = match token {
703
                Token::ObjectEnd => {
704
136
                    if state.config.allow_trailing_commas || state.delegate.object_is_empty(&object)
705
                    {
706
103
                        break;
707
33
                    }
708
33

            
709
33
                    return Err(Error {
710
33
                        offset,
711
33
                        kind: ErrorKind::IllegalTrailingComma,
712
33
                    });
713
                }
714
2232
                Token::String(value) => {
715
2232
                    state
716
2232
                        .delegate
717
2232
                        .object_key(&mut object, value)
718
2232
                        .map_err(|kind| Error {
719
1
                            offset,
720
1
                            kind: ErrorKind::ErrorFromDelegate(kind),
721
2232
                        })?
722
                }
723
                Token::Array | Token::Bool(_) | Token::Null | Token::Number(_) | Token::Object => {
724
31
                    return Err(Error {
725
31
                        offset,
726
31
                        kind: ErrorKind::ObjectKeysMustBeStrings,
727
31
                    });
728
                }
729
                _ => {
730
31
                    return Err(Error {
731
31
                        offset,
732
31
                        kind: ErrorKind::ExpectedObjectKey,
733
31
                    })
734
                }
735
            };
736

            
737
2231
            let offset = self.tokenizer.offset();
738
2231
            let token = self.tokenizer.next_or_eof().map_err(|mut error| {
739
13
                error.kind = ErrorKind::ExpectedColon;
740
13

            
741
13
                error.into_fallable()
742
2231
            })?;
743

            
744
2218
            if token != Token::Colon {
745
71
                return Err(Error {
746
71
                    offset,
747
71
                    kind: ErrorKind::ExpectedColon,
748
71
                });
749
2147
            }
750
2147

            
751
2147
            let offset = self.tokenizer.offset();
752
2147
            let value = self.read_tokens(state)?;
753
1669
            state
754
1669
                .delegate
755
1669
                .object_value(&mut object, key, value)
756
1669
                .map_err(|kind| Error {
757
1
                    offset,
758
1
                    kind: ErrorKind::ErrorFromDelegate(kind),
759
1669
                })?;
760

            
761
1668
            let offset = self.tokenizer.offset();
762
1668
            let token = self.tokenizer.next_or_eof().map_err(Error::into_fallable)?;
763

            
764
1610
            if token == Token::ObjectEnd {
765
342
                break;
766
1268
            } else if token != Token::Comma {
767
33
                return Err(Error {
768
33
                    offset,
769
33
                    kind: ErrorKind::ExpectedCommaOrEndOfObject,
770
33
                });
771
1235
            }
772
        }
773

            
774
445
        let object = state.delegate.end_object(object).map_err(|kind| Error {
775
1
            offset: self.tokenizer.offset(),
776
1
            kind: ErrorKind::ErrorFromDelegate(kind),
777
445
        })?;
778

            
779
444
        state.end_nest();
780
444

            
781
444
        Ok(object)
782
1317
    }
783

            
784
    #[inline]
785
5245
    fn read_array<D>(&mut self, state: &mut ParseState<'a, D>) -> Result<D::Value, Error<D::Error>>
786
5245
    where
787
5245
        D: ParseDelegate<'a>,
788
5245
    {
789
5245
        let offset = self.tokenizer.offset();
790
5245
        let mut array = state.delegate.begin_array().map_err(|kind| Error {
791
3
            offset: self.tokenizer.offset(),
792
3
            kind: ErrorKind::ErrorFromDelegate(kind),
793
5245
        })?;
794

            
795
6367
        loop {
796
6367
            let offset = self.tokenizer.offset();
797
6367

            
798
6367
            // a bit of a cheat for the sake of ultimate performance, instead of using an Option<Token>
799
6367
            // we look at the char itself to determine the of a token
800
6367
            if self.tokenizer.peek() == Some(PeekableTokenKind::ArrayEnd) {
801
                // commit the token
802
146
                self.tokenizer.source.offset += 1;
803
146

            
804
146
                if state.config.allow_trailing_commas || state.delegate.array_is_empty(&array) {
805
107
                    break;
806
39
                }
807
39

            
808
39
                return Err(Error {
809
39
                    offset,
810
39
                    kind: ErrorKind::IllegalTrailingComma,
811
39
                });
812
6221
            }
813

            
814
6221
            let value = self.read_tokens(state)?;
815

            
816
2743
            state
817
2743
                .delegate
818
2743
                .array_value(&mut array, value)
819
2743
                .map_err(|kind| Error {
820
1
                    offset: self.tokenizer.offset(),
821
1
                    kind: ErrorKind::ErrorFromDelegate(kind),
822
2743
                })?;
823

            
824
2742
            let offset = self.tokenizer.offset();
825
2742
            let token = self.tokenizer.next_or_eof().map_err(Error::into_fallable)?;
826

            
827
2608
            if token == Token::ArrayEnd {
828
1382
                break;
829
1226
            } else if token != Token::Comma {
830
101
                return Err(Error {
831
101
                    offset,
832
101
                    kind: ErrorKind::ExpectedCommaOrEndOfArray,
833
101
                });
834
1125
            }
835
        }
836

            
837
1489
        let array = state.delegate.end_array(array).map_err(|kind| Error {
838
1
            offset,
839
1
            kind: ErrorKind::ErrorFromDelegate(kind),
840
1489
        })?;
841

            
842
1488
        state.end_nest();
843
1488

            
844
1488
        Ok(array)
845
5245
    }
846
}
847

            
848
/// A delegate for a [`Parser`].
849
///
850
/// This type has its functions invoked while a parser is parsing a JSON
851
/// payload.
852
pub trait ParseDelegate<'a> {
853
    /// The type that can represent all parsed JSON value types.
854
    type Value;
855
    /// The type that is used to represent a JSON object.
856
    type Object;
857
    /// The type that is used to represent a JSON array.
858
    type Array;
859
    /// The type that is used to represent the key of a field in a JSON object.
860
    type Key;
861
    /// The error type for this delegate.
862
    type Error;
863

            
864
    /// Returns the value representation of `null`.
865
    fn null(&mut self) -> Result<Self::Value, Self::Error>;
866
    /// Returns the value representation of a boolean.
867
    fn boolean(&mut self, value: bool) -> Result<Self::Value, Self::Error>;
868
    /// Returns the value representation of a [`JsonNumber`].
869
    fn number(&mut self, value: JsonNumber<'a>) -> Result<Self::Value, Self::Error>;
870
    /// Returns the value representation of a [`JsonString`].
871
    fn string(&mut self, value: JsonString<'a>) -> Result<Self::Value, Self::Error>;
872

            
873
    /// Returns an empty object.
874
    fn begin_object(&mut self) -> Result<Self::Object, Self::Error>;
875
    /// Processes the key for a new value in an object. Returns the key
876
    /// representation of the [`JsonString`].
877
    fn object_key(
878
        &mut self,
879
        object: &mut Self::Object,
880
        key: JsonString<'a>,
881
    ) -> Result<Self::Key, Self::Error>;
882
    /// Adds a new key-value pair to an object.
883
    fn object_value(
884
        &mut self,
885
        object: &mut Self::Object,
886
        key: Self::Key,
887
        value: Self::Value,
888
    ) -> Result<(), Self::Error>;
889
    /// Returns true if the object passed is empty.
890
    fn object_is_empty(&self, object: &Self::Object) -> bool;
891
    /// Returns the value representation of the object passed.
892
    fn end_object(&mut self, object: Self::Object) -> Result<Self::Value, Self::Error>;
893

            
894
    /// Returns an empty array.
895
    fn begin_array(&mut self) -> Result<Self::Array, Self::Error>;
896
    /// Adds a new value to an array.
897
    fn array_value(
898
        &mut self,
899
        array: &mut Self::Array,
900
        value: Self::Value,
901
    ) -> Result<(), Self::Error>;
902
    /// Returns true if the array passed is empty.
903
    fn array_is_empty(&self, array: &Self::Array) -> bool;
904
    /// Returns the value representation of the array passed.
905
    fn end_array(&mut self, array: Self::Array) -> Result<Self::Value, Self::Error>;
906

            
907
    /// Returns the [`JsonKind`] of `value`.
908
    fn kind_of(&self, value: &Self::Value) -> JsonKind;
909
}
910

            
911
/// A cursor over a byte slice that yields each byte together with its
/// offset in the slice.
struct ByteIterator<'a> {
    /// The complete input being iterated.
    bytes: &'a [u8],
    /// Index into `bytes` of the next byte to be yielded.
    offset: usize,
}

impl<'a> Iterator for ByteIterator<'a> {
    type Item = (usize, &'a u8);

    /// Yields the byte at the current offset along with that offset, then
    /// advances by one. Returns `None` (without advancing) once the offset is
    /// past the end of the slice.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let byte = self.bytes.get(self.offset)?;
        let offset = self.offset;
        self.offset += 1;

        Some((offset, byte))
    }
}
934

            
935
impl<'a> ByteIterator<'a> {
936
    #[inline]
937
3849
    pub fn new(slice: &'a [u8]) -> Self {
938
3849
        Self {
939
3849
            bytes: slice,
940
3849
            offset: 0,
941
3849
        }
942
3849
    }
943

            
944
    #[inline]
945
24658
    pub fn peek(&mut self) -> Option<&'a u8> {
946
24658
        self.bytes.get(self.offset)
947
24658
    }
948

            
949
    #[inline]
950
1314
    fn next_non_ws(&mut self) -> Option<(usize, &'a u8)> {
951
1314
        self.skip_ws();
952
1314

            
953
1314
        self.next()
954
1314
    }
955

            
956
    #[inline]
957
25993
    fn read_non_ws(&mut self) -> Result<(usize, &'a u8), Error> {
958
25993
        self.skip_ws();
959
25993

            
960
25993
        self.read()
961
25993
    }
962

            
963
    #[inline]
964
34869
    fn skip_ws(&mut self) {
965
54109
        loop {
966
54109
            match self.bytes.get(self.offset) {
967
19240
                Some(b' ' | b'\n' | b'\t' | b'\r') => {
968
19240
                    self.offset += 1;
969
19240
                }
970
                _ => {
971
34869
                    return;
972
34869
                }
973
34869
            }
974
34869
        }
975
34869
    }
976

            
977
    #[inline]
978
35295
    fn read(&mut self) -> Result<(usize, &'a u8), Error> {
979
35295
        self.next().ok_or(Error {
980
35295
            offset: self.offset,
981
35295
            kind: ErrorKind::UnexpectedEof,
982
35295
        })
983
35295
    }
984
}
985

            
986
4678
// NOTE(review): presumably classifies the first character of a numeric
// literal (a zero, another digit, or a sign) — confirm against the number
// tokenizing code, which is outside this section.
#[derive(Eq, PartialEq, Copy, Clone)]
enum InitialNumberState {
    Zero,
    Digit,
    Sign,
}
992

            
993
/// A JSON Value parsing configuration.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
#[non_exhaustive]
#[must_use]
pub struct ParseConfig {
    /// If true, allows trailing commas when parsing arrays and objects. If
    /// false, trailing commas will cause an [`ErrorKind::IllegalTrailingComma`]
    /// to be returned.
    pub allow_trailing_commas: bool,
    /// If present, nested arrays and objects will be limited to
    /// `recursion_limit` levels of nesting. If not present, no checks will be
    /// performed which can cause a stack overflow with very deeply nested
    /// payloads.
    pub recursion_limit: Option<usize>,
    /// If true, any JSON value type is allowed at the root of the JSON
    /// payload. If false, only arrays or objects will be allowed to parse at
    /// the root of the JSON payload.
    pub allow_all_types_at_root: bool,
}

impl Default for ParseConfig {
    /// Equivalent to [`ParseConfig::new()`].
    fn default() -> Self {
        Self::new()
    }
}

impl ParseConfig {
    /// Returns the default configuration:
    ///
    /// ```rust
    /// let config = justjson::parser::ParseConfig::new();
    /// assert_eq!(config.allow_trailing_commas, false);
    /// assert_eq!(config.recursion_limit, Some(128));
    /// assert_eq!(config.allow_all_types_at_root, true);
    /// ```
    pub const fn new() -> Self {
        Self {
            allow_trailing_commas: false,
            recursion_limit: Some(128),
            allow_all_types_at_root: true,
        }
    }

    /// Returns a strict configuration, which differs from the default
    /// configuration by only allowing objects and arrays at the root:
    ///
    /// ```rust
    /// let config = justjson::parser::ParseConfig::strict();
    /// assert_eq!(config.allow_trailing_commas, false);
    /// assert_eq!(config.recursion_limit, Some(128));
    /// assert_eq!(config.allow_all_types_at_root, false);
    /// ```
    pub const fn strict() -> Self {
        Self {
            allow_trailing_commas: false,
            recursion_limit: Some(128),
            allow_all_types_at_root: false,
        }
    }

    /// Disables recursion limit testing.
    ///
    /// Note: Malicious payloads may be able to cause stack overflows to occur
    /// if this is disabled.
    pub const fn without_recursion_limit(mut self) -> Self {
        self.recursion_limit = None;
        self
    }

    /// Sets the maximum recursion limit to `limit`.
    pub const fn with_recursion_limit(mut self, limit: usize) -> Self {
        self.recursion_limit = Some(limit);
        self
    }

    /// Sets whether to allow all types at the root of the JSON payload. If
    /// false, only arrays and objects will be allowed at the root of the JSON
    /// payload.
    pub const fn allowing_all_types_at_root(mut self, allow_all: bool) -> Self {
        self.allow_all_types_at_root = allow_all;
        self
    }

    /// Allows trailing commas when parsing objects and arrays.
    ///
    /// ```rust
    /// # #[cfg(feature = "alloc")]
    /// # fn wrapper() {
    /// use justjson::parser::ParseConfig;
    /// use justjson::Value;
    ///
    /// let source = r#"{"a":[true,],}"#;
    /// Value::from_json(source).expect_err("not enabled by default");
    /// let config = ParseConfig::new().allowing_trailing_commas();
    /// Value::from_json_with_config(source, config).expect("now parses");
    /// # }
    /// ```
    pub const fn allowing_trailing_commas(mut self) -> Self {
        self.allow_trailing_commas = true;
        self
    }
}

            
1
#[test]
fn config_test() {
    // Flip all the values of the strict config, and verify it matches.
    assert_eq!(
        ParseConfig::strict()
            .allowing_trailing_commas()
            .without_recursion_limit()
            .allowing_all_types_at_root(true),
        ParseConfig {
            allow_trailing_commas: true,
            recursion_limit: None,
            allow_all_types_at_root: true
        }
    );
}

            
#[derive(Debug)]
struct ParseState<'a, D>
where
    D: ParseDelegate<'a>,
{
    delegate: D,
    config: ParseConfig,
    remaining_depth: usize,
    _phantom: PhantomData<&'a ()>,
}

            
impl<'a, D> ParseState<'a, D>
where
    D: ParseDelegate<'a>,
{
    #[inline]
6606
    pub fn begin_nest(&mut self) -> Result<(), ErrorKind<D::Error>> {
6606
        if self.config.recursion_limit.is_some() {
6606
            if self.remaining_depth > 0 {
6562
                self.remaining_depth -= 1;
6562
            } else {
44
                return Err(ErrorKind::RecursionLimitReached);
            }
        }

            
6562
        Ok(())
6606
    }

            
    #[inline]
1932
    pub fn end_nest(&mut self) {
1932
        if self.config.recursion_limit.is_some() {
1932
            self.remaining_depth += 1;
1932
        }
1932
    }
}

            
impl<'a, D> ParseState<'a, D>
where
    D: ParseDelegate<'a>,
{
3320
    fn new(config: ParseConfig, delegate: D) -> Self {
3320
        Self {
3320
            delegate,
3320
            remaining_depth: config.recursion_limit.unwrap_or(usize::MAX),
3320
            config,
3320
            _phantom: PhantomData,
3320
        }
3320
    }
}

            
/// Every type supported by JSON.
#[derive(Debug, Eq, PartialEq, Clone, Copy)]
pub enum JsonKind {
    /// A null value.
    Null,
    /// A boolean value.
    Boolean,
    /// A numerical value.
    Number,
    /// A string value.
    String,
    /// A list of key-value pairs.
    Object,
    /// A list of values.
    Array,
}

            
impl<'a> ParseDelegate<'a> for () {
    type Array = usize;
    type Error = Infallible;
    type Key = ();
    type Object = usize;
    type Value = JsonKind;

            
60
    fn null(&mut self) -> Result<Self::Value, Self::Error> {
60
        Ok(JsonKind::Null)
60
    }

            
140
    fn boolean(&mut self, _value: bool) -> Result<Self::Value, Self::Error> {
140
        Ok(JsonKind::Boolean)
140
    }

            
884
    fn number(&mut self, _value: JsonNumber<'a>) -> Result<Self::Value, Self::Error> {
884
        Ok(JsonKind::Number)
884
    }

            
884
    fn string(&mut self, _value: JsonString<'a>) -> Result<Self::Value, Self::Error> {
884
        Ok(JsonKind::String)
884
    }

            
934
    fn begin_object(&mut self) -> Result<Self::Object, Self::Error> {
934
        Ok(0)
934
    }

            
1192
    fn object_key(
1192
        &mut self,
1192
        _object: &mut Self::Object,
1192
        _key: JsonString<'a>,
1192
    ) -> Result<Self::Key, Self::Error> {
1192
        Ok(())
1192
    }

            
700
    fn object_value(
700
        &mut self,
700
        object: &mut Self::Object,
700
        _key: Self::Key,
700
        _value: Self::Value,
700
    ) -> Result<(), Self::Error> {
700
        *object += 1;
700
        Ok(())
700
    }

            
68
    fn object_is_empty(&self, object: &Self::Object) -> bool {
68
        *object == 0
68
    }

            
226
    fn end_object(&mut self, _object: Self::Object) -> Result<Self::Value, Self::Error> {
226
        Ok(JsonKind::Object)
226
    }

            
4106
    fn begin_array(&mut self) -> Result<Self::Array, Self::Error> {
4106
        Ok(0)
4106
    }

            
1560
    fn array_value(
1560
        &mut self,
1560
        array: &mut Self::Array,
1560
        _value: Self::Value,
1560
    ) -> Result<(), Self::Error> {
1560
        *array += 1;
1560
        Ok(())
1560
    }

            
80
    fn array_is_empty(&self, array: &Self::Array) -> bool {
80
        *array == 0
80
    }

            
884
    fn end_array(&mut self, _array: Self::Array) -> Result<Self::Value, Self::Error> {
884
        Ok(JsonKind::Array)
884
    }

            
84
    fn kind_of(&self, value: &Self::Value) -> JsonKind {
84
        *value
84
    }
}

            
1
#[test]
fn validates() {
    // Both the `&str` and the byte-slice entry points must report the kind
    // of the root value.
    assert_eq!(
        Parser::validate_json(r#"{"a":1,"b":true,"c":"hello","d":[],"e":{}}"#),
        Ok(JsonKind::Object)
    );
    assert_eq!(
        Parser::validate_json_bytes(br#"{"a":1,"b":true,"c":"hello","d":[],"e":{}}"#),
        Ok(JsonKind::Object)
    );
}

            
1
#[test]
#[cfg(feature = "alloc")]
fn tokenizes() {
    // Collecting needs `Vec`, hence the `alloc` feature gate.
    assert_eq!(
        Tokenizer::for_json("true")
            .collect::<Result<alloc::vec::Vec<_>, _>>()
            .unwrap(),
        &[Token::Bool(true)]
    );
    assert_eq!(
        Tokenizer::for_json_bytes(b"false")
            .collect::<Result<alloc::vec::Vec<_>, _>>()
            .unwrap(),
        &[Token::Bool(false)]
    );
}

            
1
#[test]
fn tokenizer_peek_tests() {
    let mut tokenizer = Tokenizer::for_json(r#"{"a": [1, true, false, null]} "#);

    // Peeking must report the kind of each upcoming token, in order, and
    // must not consume it: `next()` still has to yield that token.
    for expected in [
        PeekableTokenKind::Object,
        PeekableTokenKind::String,
        PeekableTokenKind::Colon,
        PeekableTokenKind::Array,
        PeekableTokenKind::Number,
        PeekableTokenKind::Comma,
        PeekableTokenKind::True,
        PeekableTokenKind::Comma,
        PeekableTokenKind::False,
        PeekableTokenKind::Comma,
        PeekableTokenKind::Null,
        PeekableTokenKind::ArrayEnd,
        PeekableTokenKind::ObjectEnd,
    ] {
        assert_eq!(tokenizer.peek(), Some(expected));
        tokenizer.next().expect("eof").expect("invalid tokens");
    }
    // Only trailing whitespace remains, so both peek and next report the end.
    assert_eq!(tokenizer.peek(), None);
    assert_eq!(tokenizer.next(), None);

    assert_eq!(
        Tokenizer::for_json("<").peek(),
        Some(PeekableTokenKind::Unrecognized)
    );
}