JsonScanner.cs 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524
  1. /* Copyright 2010-present MongoDB Inc.
  2. *
  3. * Licensed under the Apache License, Version 2.0 (the "License");
  4. * you may not use this file except in compliance with the License.
  5. * You may obtain a copy of the License at
  6. *
  7. * http://www.apache.org/licenses/LICENSE-2.0
  8. *
  9. * Unless required by applicable law or agreed to in writing, software
  10. * distributed under the License is distributed on an "AS IS" BASIS,
  11. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. * See the License for the specific language governing permissions and
  13. * limitations under the License.
  14. */
  15. using System;
  16. using System.IO;
  17. using System.Text;
  18. namespace MongoDB.Bson.IO
  19. {
  20. /// <summary>
  21. /// A static class that represents a JSON scanner.
  22. /// </summary>
  23. internal static class JsonScanner
  24. {
  25. // public static methods
  /// <summary>
  /// Reads the next JsonToken from a JsonBuffer, skipping any leading whitespace.
  /// </summary>
  /// <param name="buffer">The buffer.</param>
  /// <returns>The next token (an EndOfFile token when the buffer is exhausted).</returns>
  /// <exception cref="FormatException">The next character cannot start any token.</exception>
  public static JsonToken GetNextToken(JsonBuffer buffer)
  {
      // advance to the first character that is not whitespace (or to end of input)
      var current = buffer.Read();
      while (current != -1 && char.IsWhiteSpace((char)current))
      {
          current = buffer.Read();
      }

      if (current == -1)
      {
          return new JsonToken(JsonTokenType.EndOfFile, "<eof>");
      }

      // tokens that are fully determined by their first character
      switch (current)
      {
          case '{': return new JsonToken(JsonTokenType.BeginObject, "{");
          case '}': return new JsonToken(JsonTokenType.EndObject, "}");
          case '[': return new JsonToken(JsonTokenType.BeginArray, "[");
          case ']': return new JsonToken(JsonTokenType.EndArray, "]");
          case '(': return new JsonToken(JsonTokenType.LeftParen, "(");
          case ')': return new JsonToken(JsonTokenType.RightParen, ")");
          case ':': return new JsonToken(JsonTokenType.Colon, ":");
          case ',': return new JsonToken(JsonTokenType.Comma, ",");
          case '\'':
          case '"':
              // the opening quote character is also the expected closing quote
              return GetStringToken(buffer, (char)current);
          case '/':
              return GetRegularExpressionToken(buffer);
      }

      // tokens whose first character belongs to a class of characters
      var lead = (char)current;
      if (current == '-' || char.IsDigit(lead))
      {
          return GetNumberToken(buffer, current);
      }

      if (current == '$' || current == '_' || char.IsLetter(lead))
      {
          return GetUnquotedStringToken(buffer);
      }

      // push the offending character back so the error snippet starts at it
      buffer.UnRead(current);
      throw new FormatException(FormatMessage("Invalid JSON input", buffer, buffer.Position));
  }
  74. // private methods
  /// <summary>
  /// Builds an error message consisting of the given text followed by a quoted
  /// snippet of the buffer taken at the given position.
  /// </summary>
  /// <param name="message">The leading text of the error message.</param>
  /// <param name="buffer">The buffer the snippet is taken from.</param>
  /// <param name="start">The buffer position passed to GetSnippet.</param>
  /// <returns>The message in the form "message 'snippet'.".</returns>
  private static string FormatMessage(string message, JsonBuffer buffer, int start)
  {
      // maximum snippet length requested from the buffer
      const int snippetLength = 20;
      return string.Format("{0} '{1}'.", message, buffer.GetSnippet(start, snippetLength));
  }
  /// <summary>
  /// Scans a numeric token (integer, double or -Infinity) whose first character
  /// has already been read from the buffer.
  /// </summary>
  /// <param name="buffer">The buffer, positioned just after the first character.</param>
  /// <param name="firstChar">The first character of the number ('-' or a digit).</param>
  /// <returns>
  /// A DoubleJsonToken when the lexeme contains a fraction, an exponent or is -Infinity;
  /// otherwise an Int32JsonToken when the value fits in an int, else an Int64JsonToken.
  /// </returns>
  /// <exception cref="FormatException">The characters do not form a valid JSON number.</exception>
  private static JsonToken GetNumberToken(JsonBuffer buffer, int firstChar)
  {
      var c = firstChar;

      // leading digit or '-' has already been read
      var start = buffer.Position - 1;

      NumberState state;
      if (c == '-')
      {
          state = NumberState.SawLeadingMinus;
      }
      else if (c == '0')
      {
          state = NumberState.SawLeadingZero;
      }
      else if (IsAsciiDigit(c))
      {
          state = NumberState.SawIntegerDigits;
      }
      else
      {
          // char.IsDigit (used by the caller) also accepts non-ASCII Unicode digits,
          // which are not valid in a JSON number; reject them here with a clear message
          throw new FormatException(FormatMessage("Invalid JSON number", buffer, start));
      }

      var type = JsonTokenType.Int64; // assume integer until proved otherwise

      while (true)
      {
          c = buffer.Read();

          // advance the state machine by one character
          switch (state)
          {
              case NumberState.SawLeadingMinus:
                  if (c == '0')
                  {
                      state = NumberState.SawLeadingZero;
                  }
                  else if (c == 'I')
                  {
                      state = NumberState.SawMinusI;
                  }
                  else if (IsAsciiDigit(c))
                  {
                      state = NumberState.SawIntegerDigits;
                  }
                  else
                  {
                      state = NumberState.Invalid;
                  }
                  break;

              case NumberState.SawLeadingZero:
                  // a leading zero may not be followed by further integer digits
                  if (c == '.')
                  {
                      state = NumberState.SawDecimalPoint;
                  }
                  else if (c == 'e' || c == 'E')
                  {
                      state = NumberState.SawExponentLetter;
                  }
                  else if (IsNumberTerminator(c))
                  {
                      state = NumberState.Done;
                  }
                  else
                  {
                      state = NumberState.Invalid;
                  }
                  break;

              case NumberState.SawIntegerDigits:
                  if (c == '.')
                  {
                      state = NumberState.SawDecimalPoint;
                  }
                  else if (c == 'e' || c == 'E')
                  {
                      state = NumberState.SawExponentLetter;
                  }
                  else if (IsNumberTerminator(c))
                  {
                      state = NumberState.Done;
                  }
                  else if (IsAsciiDigit(c))
                  {
                      state = NumberState.SawIntegerDigits;
                  }
                  else
                  {
                      state = NumberState.Invalid;
                  }
                  break;

              case NumberState.SawDecimalPoint:
                  // at least one digit must follow the decimal point
                  type = JsonTokenType.Double;
                  state = IsAsciiDigit(c) ? NumberState.SawFractionDigits : NumberState.Invalid;
                  break;

              case NumberState.SawFractionDigits:
                  if (c == 'e' || c == 'E')
                  {
                      state = NumberState.SawExponentLetter;
                  }
                  else if (IsNumberTerminator(c))
                  {
                      state = NumberState.Done;
                  }
                  else if (IsAsciiDigit(c))
                  {
                      state = NumberState.SawFractionDigits;
                  }
                  else
                  {
                      state = NumberState.Invalid;
                  }
                  break;

              case NumberState.SawExponentLetter:
                  type = JsonTokenType.Double;
                  if (c == '+' || c == '-')
                  {
                      state = NumberState.SawExponentSign;
                  }
                  else if (IsAsciiDigit(c))
                  {
                      state = NumberState.SawExponentDigits;
                  }
                  else
                  {
                      state = NumberState.Invalid;
                  }
                  break;

              case NumberState.SawExponentSign:
                  // at least one digit must follow the exponent sign
                  state = IsAsciiDigit(c) ? NumberState.SawExponentDigits : NumberState.Invalid;
                  break;

              case NumberState.SawExponentDigits:
                  if (IsNumberTerminator(c))
                  {
                      state = NumberState.Done;
                  }
                  else if (IsAsciiDigit(c))
                  {
                      state = NumberState.SawExponentDigits;
                  }
                  else
                  {
                      state = NumberState.Invalid;
                  }
                  break;

              case NumberState.SawMinusI:
                  // "-I" has been read; the remaining characters must spell "nfinity"
                  const string remainder = "nfinity";
                  var sawMinusInfinity = true;
                  foreach (var expected in remainder)
                  {
                      if (c != expected)
                      {
                          sawMinusInfinity = false;
                          break;
                      }
                      c = buffer.Read();
                  }

                  if (sawMinusInfinity)
                  {
                      // c now holds the character following "-Infinity"
                      type = JsonTokenType.Double;
                      state = IsNumberTerminator(c) ? NumberState.Done : NumberState.Invalid;
                  }
                  else
                  {
                      state = NumberState.Invalid;
                  }
                  break;
          }

          // act on terminal states; any other state keeps scanning
          if (state == NumberState.Done)
          {
              // the terminator is not part of the number; give it back to the buffer
              buffer.UnRead(c);
              var lexeme = buffer.GetSubstring(start, buffer.Position - start);
              if (type == JsonTokenType.Double)
              {
                  return new DoubleJsonToken(lexeme, JsonConvert.ToDouble(lexeme));
              }

              var value = JsonConvert.ToInt64(lexeme);
              if (value < int.MinValue || value > int.MaxValue)
              {
                  return new Int64JsonToken(lexeme, value);
              }
              return new Int32JsonToken(lexeme, (int)value);
          }

          if (state == NumberState.Invalid)
          {
              throw new FormatException(FormatMessage("Invalid JSON number", buffer, start));
          }
      }
  }

  // Returns true when c is one of the ASCII digits '0' through '9'.
  private static bool IsAsciiDigit(int c)
  {
      return c >= '0' && c <= '9';
  }

  // Returns true when c legitimately ends a number: a comma, a closing
  // brace/bracket/parenthesis, end of input (-1) or whitespace.
  private static bool IsNumberTerminator(int c)
  {
      switch (c)
      {
          case ',':
          case '}':
          case ']':
          case ')':
          case -1:
              return true;
          default:
              return char.IsWhiteSpace((char)c);
      }
  }
  /// <summary>
  /// Scans a regular expression literal of the form /pattern/options whose
  /// opening slash has already been read from the buffer.
  /// </summary>
  /// <param name="buffer">The buffer, positioned just after the opening slash.</param>
  /// <returns>A RegularExpressionJsonToken built from the scanned lexeme.</returns>
  /// <exception cref="FormatException">
  /// End of input was reached inside the pattern, or an unexpected character followed the closing slash.
  /// </exception>
  private static JsonToken GetRegularExpressionToken(JsonBuffer buffer)
  {
      // the lexeme includes the opening slash, which was consumed by the caller
      var lexemeStart = buffer.Position - 1;
      var state = RegularExpressionState.InPattern;

      for (;;)
      {
          var ch = buffer.Read();

          if (state == RegularExpressionState.InPattern)
          {
              if (ch == '/')
              {
                  // unescaped slash closes the pattern
                  state = RegularExpressionState.InOptions;
              }
              else if (ch == '\\')
              {
                  state = RegularExpressionState.InEscapeSequence;
              }
              else if (ch == -1)
              {
                  state = RegularExpressionState.Invalid;
              }
              // any other character leaves the scanner inside the pattern
          }
          else if (state == RegularExpressionState.InEscapeSequence)
          {
              // whatever follows a backslash is taken as part of the pattern
              state = RegularExpressionState.InPattern;
          }
          else if (state == RegularExpressionState.InOptions)
          {
              var isOptionLetter = ch == 'i' || ch == 'm' || ch == 'x' || ch == 's';
              if (!isOptionLetter)
              {
                  var isTerminator =
                      ch == ',' || ch == '}' || ch == ']' || ch == ')' || ch == -1 ||
                      char.IsWhiteSpace((char)ch);
                  state = isTerminator ? RegularExpressionState.Done : RegularExpressionState.Invalid;
              }
          }

          if (state == RegularExpressionState.Done)
          {
              // the terminator is not part of the literal; give it back to the buffer
              buffer.UnRead(ch);
              var text = buffer.GetSubstring(lexemeStart, buffer.Position - lexemeStart);
              return new RegularExpressionJsonToken(text, new BsonRegularExpression(text));
          }

          if (state == RegularExpressionState.Invalid)
          {
              throw new FormatException(FormatMessage("Invalid JSON regular expression", buffer, lexemeStart));
          }
      }
  }
  /// <summary>
  /// Scans a quoted string whose opening quote has already been read from the
  /// buffer, decoding backslash escape sequences along the way.
  /// </summary>
  /// <param name="buffer">The buffer, positioned just after the opening quote.</param>
  /// <param name="quoteCharacter">The quote character that opened (and must close) the string.</param>
  /// <returns>A String token carrying both the raw lexeme and the decoded value.</returns>
  /// <exception cref="FormatException">
  /// The string contains an invalid escape sequence, or end of input was reached before the closing quote.
  /// </exception>
  private static JsonToken GetStringToken(JsonBuffer buffer, char quoteCharacter)
  {
      // opening quote has already been read
      var start = buffer.Position - 1;
      var sb = new StringBuilder();

      while (true)
      {
          var c = buffer.Read();
          switch (c)
          {
              case '\\':
                  c = buffer.Read();
                  switch (c)
                  {
                      case '\'': sb.Append('\''); break;
                      case '"': sb.Append('"'); break;
                      case '\\': sb.Append('\\'); break;
                      case '/': sb.Append('/'); break;
                      case 'b': sb.Append('\b'); break;
                      case 'f': sb.Append('\f'); break;
                      case 'n': sb.Append('\n'); break;
                      case 'r': sb.Append('\r'); break;
                      case 't': sb.Append('\t'); break;
                      case 'u':
                          var u1 = buffer.Read();
                          var u2 = buffer.Read();
                          var u3 = buffer.Read();
                          var u4 = buffer.Read();
                          // when input ends inside the escape nothing is appended here;
                          // the next read then reports the end-of-file error
                          if (u4 != -1)
                          {
                              // validate every character individually: Convert.ToInt32(hex, 16)
                              // would tolerate a "0x" prefix and throws ArgumentException
                              // (not FormatException) for a leading '-'
                              var d1 = HexDigitValue(u1);
                              var d2 = HexDigitValue(u2);
                              var d3 = HexDigitValue(u3);
                              var d4 = HexDigitValue(u4);
                              if (d1 < 0 || d2 < 0 || d3 < 0 || d4 < 0)
                              {
                                  var hex = new string(new char[] { (char)u1, (char)u2, (char)u3, (char)u4 });
                                  var hexMessage = string.Format("Invalid escape sequence in JSON string '\\u{0}'.", hex);
                                  throw new FormatException(hexMessage);
                              }
                              sb.Append((char)((d1 << 12) | (d2 << 8) | (d3 << 4) | d4));
                          }
                          break;
                      default:
                          // end of input after the backslash is reported below
                          if (c != -1)
                          {
                              var message = string.Format("Invalid escape sequence in JSON string '\\{0}'.", (char)c);
                              throw new FormatException(message);
                          }
                          break;
                  }
                  break;

              default:
                  if (c == quoteCharacter)
                  {
                      // the lexeme spans from the opening quote through the closing quote
                      var lexeme = buffer.GetSubstring(start, buffer.Position - start);
                      return new StringJsonToken(JsonTokenType.String, lexeme, sb.ToString());
                  }
                  if (c != -1)
                  {
                      sb.Append((char)c);
                  }
                  break;
          }

          if (c == -1)
          {
              throw new FormatException(FormatMessage("End of file in JSON string.", buffer, start));
          }
      }
  }

  // Returns the numeric value (0-15) of an ASCII hexadecimal digit, or -1 when
  // c is not a hexadecimal digit.
  private static int HexDigitValue(int c)
  {
      if (c >= '0' && c <= '9')
      {
          return c - '0';
      }
      if (c >= 'a' && c <= 'f')
      {
          return c - 'a' + 10;
      }
      if (c >= 'A' && c <= 'F')
      {
          return c - 'A' + 10;
      }
      return -1;
  }
  /// <summary>
  /// Scans an unquoted string whose first character has already been read
  /// from the buffer.
  /// </summary>
  /// <param name="buffer">The buffer, positioned just after the first character.</param>
  /// <returns>An UnquotedString token whose value is the lexeme itself.</returns>
  private static JsonToken GetUnquotedStringToken(JsonBuffer buffer)
  {
      // the lexeme includes the first character, which was consumed by the caller
      var lexemeStart = buffer.Position - 1;

      // consume '$', '_', letters and digits until some other character (or end of input) appears
      var next = buffer.Read();
      while (true)
      {
          var continuesToken = next == '$' || next == '_' || char.IsLetterOrDigit((char)next);
          if (!continuesToken)
          {
              break;
          }
          next = buffer.Read();
      }

      // the character that stopped the scan is not part of the token
      buffer.UnRead(next);

      var text = buffer.GetSubstring(lexemeStart, buffer.Position - lexemeStart);
      return new StringJsonToken(JsonTokenType.UnquotedString, text, text);
  }
  486. // nested types
  /// <summary>
  /// The states of the state machine used by GetNumberToken.
  /// </summary>
  private enum NumberState
  {
      /// <summary>A leading '-' has been read.</summary>
      SawLeadingMinus,

      /// <summary>A '0' has been read as the first digit.</summary>
      SawLeadingZero,

      /// <summary>One or more integer digits (not starting with '0') have been read.</summary>
      SawIntegerDigits,

      /// <summary>The decimal point has been read; a digit must follow.</summary>
      SawDecimalPoint,

      /// <summary>One or more digits after the decimal point have been read.</summary>
      SawFractionDigits,

      /// <summary>The exponent letter ('e' or 'E') has been read.</summary>
      SawExponentLetter,

      /// <summary>The sign of the exponent ('+' or '-') has been read; a digit must follow.</summary>
      SawExponentSign,

      /// <summary>One or more exponent digits have been read.</summary>
      SawExponentDigits,

      /// <summary>"-I" has been read; the rest of "-Infinity" is expected.</summary>
      SawMinusI,

      /// <summary>A terminating character was reached after a complete number.</summary>
      Done,

      /// <summary>The characters read do not form a valid number.</summary>
      Invalid
  }
  /// <summary>
  /// The states of the state machine used by GetRegularExpressionToken.
  /// </summary>
  private enum RegularExpressionState
  {
      /// <summary>Scanning the pattern between the opening and closing slashes.</summary>
      InPattern,

      /// <summary>A backslash has been read inside the pattern; the next character is escaped.</summary>
      InEscapeSequence,

      /// <summary>The closing slash has been read; scanning the option letters.</summary>
      InOptions,

      /// <summary>A terminating character was reached after the options.</summary>
      Done,

      /// <summary>The characters read do not form a valid regular expression literal.</summary>
      Invalid
  }
  509. }
  510. }