You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

62 lines
1.6 KiB

4 years ago
  1. 'use strict';
  /**
   * Reports whether every byte sequence in a buffer is well-formed UTF-8.
   * Derived from Markus Kuhn's utf8_check.c
   * (https://www.cl.cam.ac.uk/%7Emgk25/ucs/utf8_check.c).
   *
   * Rejected input: stray continuation bytes, truncated sequences, overlong
   * encodings, UTF-16 surrogates (U+D800 - U+DFFF) and code points above
   * U+10FFFF.
   *
   * @param {Buffer} buf The bytes to validate
   * @return {Boolean} `true` when `buf` holds only valid UTF-8, else `false`
   * @public
   */
  const isValidUTF8 = (buf) => {
    const size = buf.length;

    // A continuation (trailing) byte has the bit pattern 10xxxxxx.
    const isTrail = (byte) => (byte & 0xc0) === 0x80;

    let pos = 0;

    while (pos < size) {
      const lead = buf[pos];

      // 0xxxxxxx
      if (lead < 0x80) {
        pos += 1;
        continue;
      }

      // 110xxxxx 10xxxxxx
      if ((lead & 0xe0) === 0xc0) {
        if (pos + 1 === size) {
          return false; // truncated
        }
        if (!isTrail(buf[pos + 1])) {
          return false;
        }
        if ((lead & 0xfe) === 0xc0) {
          return false; // overlong (lead byte 0xc0 or 0xc1)
        }
        pos += 2;
        continue;
      }

      // 1110xxxx 10xxxxxx 10xxxxxx
      if ((lead & 0xf0) === 0xe0) {
        if (pos + 2 >= size) {
          return false; // truncated
        }
        const second = buf[pos + 1];
        if (!isTrail(second) || !isTrail(buf[pos + 2])) {
          return false;
        }
        if (lead === 0xe0 && (second & 0xe0) === 0x80) {
          return false; // overlong
        }
        if (lead === 0xed && (second & 0xe0) === 0xa0) {
          return false; // surrogate (U+D800 - U+DFFF)
        }
        pos += 3;
        continue;
      }

      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      if ((lead & 0xf8) === 0xf0) {
        if (pos + 3 >= size) {
          return false; // truncated
        }
        const second = buf[pos + 1];
        if (
          !isTrail(second) ||
          !isTrail(buf[pos + 2]) ||
          !isTrail(buf[pos + 3])
        ) {
          return false;
        }
        if (lead === 0xf0 && (second & 0xf0) === 0x80) {
          return false; // overlong
        }
        if ((lead === 0xf4 && second > 0x8f) || lead > 0xf4) {
          return false; // > U+10FFFF
        }
        pos += 4;
        continue;
      }

      // Stray continuation byte or a lead byte that is never valid.
      return false;
    }

    return true;
  };
  58. module.exports = isValidUTF8;