<?php

/**
 * Fast PHP streaming JSON parser.
 *
 * @author Vitaliy Filippov
 *
 * There are two other streaming JSON parsers for PHP:
 *
 *   https://github.com/skolodyazhnyy/json-stream
 *   https://github.com/salsify/jsonstreamingparser
 *
 * Both have drawbacks:
 *
 * - json-stream reads and parses the input stream character by character.
 * - jsonstreamingparser reads the input stream line by line, but still parses it
 *   character by character.
 * - jsonstreamingparser is SAX-like, which is inconvenient to use.
 * - Both require a file handle and cannot read from an arbitrary (mock) callback.
 *
 * Quick tests show that json-stream needs 4.18s to parse a sample 13MB JSON file and
 * jsonstreamingparser needs 6.05s. This library does the same thing in 1.04s.
 *
 * No composer, PHP is not the place for npm.
 *
 * Usage
 * -----
 *
 * [enterArray|enterObject ...] readValue, readValue, readValue [exitArray|exitObject]
 *
 * Walking an array of objects key by key:
 *
 *     $fp = fopen('datasets.json', 'r');
 *     $json = new JSONStream(function() use($fp) { return fread($fp, 262144); });
 *     $json->enterArray();
 *     while (!$json->isEnded())
 *     {
 *         $json->enterObject();
 *         while ($json->readValue($k))
 *         {
 *             $json->readValue($v);
 *             if ($k == 'Caption')
 *             {
 *                 print "$v\n";
 *             }
 *         }
 *         $json->exitObject();
 *     }
 *     $json->exitArray();
 *     fclose($fp);
 *
 * Reading every array element as a complete PHP value:
 *
 *     $fp = fopen('datasets.json', 'r');
 *     $json = new JSONStream(function() use($fp) { return fread($fp, 262144); });
 *     $json->enterArray();
 *     while (!$json->isEnded())
 *     {
 *         $json->readValue($v);
 *         print $v['Caption']."\n";
 *     }
 *     $json->exitArray();
 *     fclose($fp);
 */

/**
 * Thrown on malformed input, premature end of input, or misuse of the enter/exit API.
 */
class JSONStreamException extends Exception
{
}

class JSONStream
{
    // Container types stored in frame slot 1.
    // NOTE(review): only distinctness matters to this class; confirm the concrete
    // values if external code compares against them.
    const OBJ = 1;
    const ARR = 2;

    /** Single-character JSON escapes: character after the backslash => decoded byte. */
    public static $esc = [
        '"' => '"',
        "\\" => "\\",
        '/' => '/',
        // PHP double-quoted strings have no "\b" escape, so backspace is spelled in hex.
        'b' => "\x08",
        'f' => "\f",
        'n' => "\n",
        'r' => "\r",
        't' => "\t",
    ];

    /** Bytes fetched from the reader that are not (fully) parsed yet. */
    protected $buffer = '';
    /** Number of stream bytes that preceded $buffer[0]. */
    protected $totalOffset = 0;
    /** Parse position inside $buffer. */
    protected $offset = 0;
    /**
     * Stack of open containers, innermost last. Each frame is
     * [ 0 => state, 1 => self::OBJ|self::ARR, 2 => collected value, 3 => pending object key ]
     * where state is 0 for a container being collected by readValue(), 1 for a container
     * entered by the caller that is still open, and 2 for a container entered by the
     * caller whose closing bracket was already consumed.
     */
    protected $in = [];
    /** Callable returning the next chunk of input, or '' when the input is exhausted. */
    protected $read;

    /**
     * @param callable $read invoked without arguments whenever more input is needed;
     *   must return a string, an empty string (or a non-string) meaning end of input
     */
    public function __construct($read)
    {
        $this->read = $read;
    }

    /**
     * Step into the object that starts at the current position.
     *
     * @throws JSONStreamException when the next token is not '{' or the input ended
     */
    public function enterObject()
    {
        $this->enter('{', '}', self::OBJ);
    }

    /**
     * Step into the array that starts at the current position.
     *
     * @throws JSONStreamException when the next token is not '[' or the input ended
     */
    public function enterArray()
    {
        $this->enter('[', ']', self::ARR);
    }

    /**
     * Leave an object entered with enterObject() after all its members were read.
     *
     * @throws JSONStreamException when the object is not finished or the innermost
     *   container is not an object
     */
    public function exitObject()
    {
        $this->leave(self::OBJ, 'object');
    }

    /**
     * Leave an array entered with enterArray() after all its elements were read.
     *
     * @throws JSONStreamException when the array is not finished or the innermost
     *   container is not an array
     */
    public function exitArray()
    {
        $this->leave(self::ARR, 'array');
    }

    /**
     * @return bool true when no container is open or the innermost one has been closed
     */
    public function isEnded()
    {
        return !$this->in || $this->in[count($this->in)-1][0] == 2;
    }

    /**
     * Read the next complete value (scalar, or whole nested array/object as a PHP array)
     * from the innermost entered container. Inside an entered object, keys and values
     * are returned by alternating calls.
     *
     * @param mixed $value receives the value; left untouched when nothing is read
     * @return bool true when the container still has more to read afterwards, false
     *   when it is finished (also returned, without reading, if it was already finished)
     * @throws JSONStreamException on malformed input or premature end of input
     */
    public function readValue(&$value)
    {
        if ($this->isEnded())
        {
            // Refuse to read anything until exitObject() / exitArray()
            return false;
        }
        $depth = count($this->in);
        do
        {
            // readToken() pushes a frame when a nested container opens; keep going
            // until every such frame has been closed again.
            $v = $this->readToken();
        } while (count($this->in) > $depth);
        $value = $v;
        return $this->in[count($this->in)-1][0] == 1;
    }

    /**
     * Hand the buffered but not yet parsed bytes back to the caller and empty the buffer.
     *
     * @return string the unparsed remainder of the buffer
     */
    public function unreadBuffer()
    {
        // Only the tail after the parse position is unread; the part before it has
        // already been consumed as JSON.
        $rest = (string)substr($this->buffer, $this->offset);
        $this->totalOffset += strlen($this->buffer);
        $this->buffer = '';
        $this->offset = 0;
        return $rest;
    }

    /**
     * Forget all parser state so that a new document can be parsed with the same reader.
     */
    public function restart()
    {
        $this->totalOffset = 0;
        $this->buffer = '';
        $this->offset = 0;
        $this->in = [];
    }

    /**
     * Fetch the next chunk from the reader callback.
     *
     * @return string the chunk; '' when the callback returned '' or a non-string
     */
    protected function read()
    {
        $r = $this->read;
        $chunk = $r();
        return is_string($chunk) ? $chunk : '';
    }

    /**
     * Make sure at least $n unparsed bytes are buffered, pulling chunks as needed.
     * Already parsed bytes are dropped whenever new data is appended, so the buffer
     * does not grow with the size of the document.
     *
     * @param int $n number of bytes wanted after the parse position
     * @return bool true when $n bytes are available, false when the input ended first
     */
    protected function fill($n)
    {
        while (strlen($this->buffer) - $this->offset < $n)
        {
            $chunk = $this->read();
            if ($chunk === '')
            {
                return false;
            }
            if ($this->offset > 0)
            {
                // Keep totalOffset + offset equal to the absolute stream position.
                $this->totalOffset += $this->offset;
                $this->buffer = (string)substr($this->buffer, $this->offset);
                $this->offset = 0;
            }
            $this->buffer .= $chunk;
        }
        return true;
    }

    /**
     * Advance past whitespace, reading more input when the buffer runs out.
     */
    protected function skipWhitespace()
    {
        while ($this->fill(1))
        {
            // Space, \t, \n, \r, vertical tab, form feed.
            $n = strspn($this->buffer, " \t\n\r\x0B\x0C", $this->offset);
            if ($n === 0)
            {
                return;
            }
            $this->offset += $n;
        }
    }

    /**
     * Skip whitespace and look at the next byte without consuming it.
     *
     * @return string the next byte, or '' at end of input
     */
    protected function peek()
    {
        $this->skipWhitespace();
        return $this->offset < strlen($this->buffer) ? $this->buffer[$this->offset] : '';
    }

    /**
     * Build the exception for an unexpected byte at the current position.
     *
     * @param string $c the offending byte, or '' for end of input
     * @return JSONStreamException
     */
    protected function error($c)
    {
        $at = $this->totalOffset + $this->offset;
        if ($c === '')
        {
            return new JSONStreamException('unexpected end of stream at offset '.$at);
        }
        return new JSONStreamException('unexpected token at offset '.$at.': '.$c);
    }

    /**
     * Shared implementation of enterObject() / enterArray().
     *
     * @param string $open expected opening bracket
     * @param string $close matching closing bracket
     * @param int $type self::OBJ or self::ARR
     */
    protected function enter($open, $close, $type)
    {
        $c = $this->peek();
        if ($c !== $open)
        {
            throw $this->error($c);
        }
        $this->offset++;
        if ($this->peek() === $close)
        {
            // Empty container: consume the closing bracket right away, otherwise the
            // parent would find it where it expects ',' or its own closing bracket.
            $this->offset++;
            $state = 2;
        }
        else
        {
            $state = 1;
        }
        $this->in[] = [ $state, $type, null, null ];
    }

    /**
     * Shared implementation of exitObject() / exitArray().
     *
     * @param int $type self::OBJ or self::ARR
     * @param string $what 'object' or 'array', used in error messages
     */
    protected function leave($type, $what)
    {
        if (!$this->in)
        {
            throw new JSONStreamException('not inside '.$what);
        }
        if (!$this->isEnded())
        {
            throw new JSONStreamException($what.' not ended yet');
        }
        if ($this->in[count($this->in)-1][1] != $type)
        {
            throw new JSONStreamException('not inside '.$what);
        }
        array_pop($this->in);
        if ($this->in)
        {
            // Tell the parent that one of its values is complete so that it consumes
            // the following ',' or its own closing bracket. The value itself is not
            // stored because a parent of an entered container is always entered too.
            $v = null;
            $this->pushValue($v);
        }
    }

    /**
     * Parse one token at the current position and feed it to the innermost container.
     *
     * @return mixed the parsed scalar, the finished container that the token completed,
     *   or null when the token opened a non-empty nested container
     * @throws JSONStreamException on malformed input or premature end of input
     */
    protected function readToken()
    {
        $c = $this->peek();
        if ($c === '')
        {
            throw $this->error('');
        }
        if ($c === '[' || $c === '{')
        {
            $this->offset++;
            if ($this->peek() !== ($c === '[' ? ']' : '}'))
            {
                $this->in[] = [ 0, $c === '[' ? self::ARR : self::OBJ, [], null ];
                return null;
            }
            // Empty array / object: both are represented as an empty PHP array.
            $this->offset++;
            $v = [];
        }
        elseif ($c === '"')
        {
            $this->offset++;
            $v = $this->readString();
        }
        elseif ($c === 't')
        {
            $v = $this->readLiteral('true', true);
        }
        elseif ($c === 'f')
        {
            $v = $this->readLiteral('false', false);
        }
        elseif ($c === 'n')
        {
            $v = $this->readLiteral('null', null);
        }
        elseif ($c === '-' || ctype_digit($c))
        {
            $v = $this->readNumber();
        }
        else
        {
            throw $this->error($c);
        }
        $this->pushValue($v);
        return $v;
    }

    /**
     * Parse the body of a string; the opening quote must already be consumed.
     *
     * @return string the decoded string
     * @throws JSONStreamException on an invalid escape or premature end of input
     */
    protected function readString()
    {
        $v = '';
        while (true)
        {
            // Six bytes are enough to see a complete \uXXXX escape.
            $this->fill(6);
            if (!preg_match('/(?:[^"\\\\]+|\\\\[\\\\"\/bfnrt]|(?:\\\\u[0-9a-fA-F]{4})+)/As', $this->buffer, $m, 0, $this->offset))
            {
                break;
            }
            $piece = $m[0];
            $len = strlen($piece);
            if ($piece[0] !== "\\")
            {
                $v .= $piece;
            }
            elseif ($piece[1] === 'u')
            {
                if (strlen($this->buffer) - $this->offset < $len + 6 && $this->fill($len + 6))
                {
                    // The run may continue in the data just read; match again so that
                    // a surrogate pair is never converted in two halves.
                    continue;
                }
                $v .= mb_convert_encoding(pack('H*', str_replace('\\u', '', $piece)), 'UTF-8', 'UTF-16BE');
            }
            else
            {
                $v .= self::$esc[$piece[1]];
            }
            $this->offset += $len;
        }
        $c = $this->offset < strlen($this->buffer) ? $this->buffer[$this->offset] : '';
        if ($c !== '"')
        {
            throw $this->error($c);
        }
        $this->offset++;
        return $v;
    }

    /**
     * Consume a keyword (true / false / null) at the current position.
     *
     * @param string $word the expected keyword
     * @param mixed $value the PHP value it stands for
     * @return mixed $value
     * @throws JSONStreamException when the input does not spell $word
     */
    protected function readLiteral($word, $value)
    {
        $len = strlen($word);
        $this->fill($len);
        if (substr($this->buffer, $this->offset, $len) !== $word)
        {
            throw $this->error($this->buffer[$this->offset]);
        }
        $this->offset += $len;
        return $value;
    }

    /**
     * Consume a number at the current position.
     *
     * @return int|float the numeric value
     * @throws JSONStreamException when the input is not a valid number
     */
    protected function readNumber()
    {
        $this->fill(64);
        while (true)
        {
            if (!preg_match('/-?(?:\d+)(?:\.\d+)?(?:[Ee][\+\-]?\d+)?/As', $this->buffer, $m, 0, $this->offset))
            {
                throw $this->error($this->buffer[$this->offset]);
            }
            $len = strlen($m[0]);
            // A match that reaches the end of the buffered data may continue in the
            // next chunk; fetch it and match again.
            if ($this->offset + $len < strlen($this->buffer) || !$this->fill($len + 1))
            {
                break;
            }
        }
        $this->offset += $len;
        return 0 + $m[0];
    }

    /**
     * Feed a finished value (or object key) to the innermost container, consume the
     * separator that follows it and close every container that ends right there.
     *
     * @param mixed $v the value; replaced by the finished container whenever a container
     *   collected by readValue() is closed, and by its string form when a true / false /
     *   null literal is used as an object key
     * @throws JSONStreamException when an object key is an array or is not followed by ':'
     */
    protected function pushValue(&$v)
    {
        while ($this->in)
        {
            $last = &$this->in[count($this->in)-1];
            // Strict check: '0' and '' are valid keys and must not read as "no key".
            if ($last[1] == self::OBJ && ($last[3] === null || $last[3] === false))
            {
                if (!is_string($v))
                {
                    if (is_array($v))
                    {
                        throw new JSONStreamException('object key must be a string, but it is '.json_encode($v, JSON_UNESCAPED_UNICODE|JSON_UNESCAPED_SLASHES));
                    }
                    elseif ($v === true)
                    {
                        $v = 'true';
                    }
                    elseif ($v === false)
                    {
                        $v = 'false';
                    }
                    elseif ($v === null)
                    {
                        $v = 'null';
                    }
                }
                // Entered objects hand keys to the caller and only remember that one
                // is pending; collected objects keep the key for the next value.
                $last[3] = $last[0] ? true : $v;
                $c = $this->peek();
                if ($c !== ':')
                {
                    throw $this->error($c);
                }
                $this->offset++;
                return;
            }
            if ($last[1] == self::OBJ)
            {
                if (!$last[0])
                {
                    $last[2][$last[3]] = $v;
                }
                $last[3] = null;
                $close = '}';
            }
            else
            {
                if (!$last[0])
                {
                    $last[2][] = $v;
                }
                $close = ']';
            }
            if ($this->peek() === ',')
            {
                $this->offset++;
            }
            if ($this->peek() !== $close)
            {
                return;
            }
            $this->offset++;
            if ($last[0] != 0)
            {
                $last[0] = 2; // end of value, caller must call exitObject / exitArray
                return;
            }
            // A collected container is complete: it becomes the value of its parent.
            $v = $last[2];
            unset($last);
            array_pop($this->in);
        }
    }
}