index.js 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340
  1. 'use strict';
  2. var parsePath = require('parse-path');
  3. function _interopDefaultLegacy (e) { return e && typeof e === 'object' && 'default' in e ? e : { 'default': e }; }
  4. var parsePath__default = /*#__PURE__*/_interopDefaultLegacy(parsePath);
  5. // https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
  6. const DATA_URL_DEFAULT_MIME_TYPE = 'text/plain';
  7. const DATA_URL_DEFAULT_CHARSET = 'us-ascii';
  8. const testParameter = (name, filters) => filters.some(filter => filter instanceof RegExp ? filter.test(name) : filter === name);
  9. const normalizeDataURL = (urlString, {stripHash}) => {
  10. const match = /^data:(?<type>[^,]*?),(?<data>[^#]*?)(?:#(?<hash>.*))?$/.exec(urlString);
  11. if (!match) {
  12. throw new Error(`Invalid URL: ${urlString}`);
  13. }
  14. let {type, data, hash} = match.groups;
  15. const mediaType = type.split(';');
  16. hash = stripHash ? '' : hash;
  17. let isBase64 = false;
  18. if (mediaType[mediaType.length - 1] === 'base64') {
  19. mediaType.pop();
  20. isBase64 = true;
  21. }
  22. // Lowercase MIME type
  23. const mimeType = (mediaType.shift() || '').toLowerCase();
  24. const attributes = mediaType
  25. .map(attribute => {
  26. let [key, value = ''] = attribute.split('=').map(string => string.trim());
  27. // Lowercase `charset`
  28. if (key === 'charset') {
  29. value = value.toLowerCase();
  30. if (value === DATA_URL_DEFAULT_CHARSET) {
  31. return '';
  32. }
  33. }
  34. return `${key}${value ? `=${value}` : ''}`;
  35. })
  36. .filter(Boolean);
  37. const normalizedMediaType = [
  38. ...attributes,
  39. ];
  40. if (isBase64) {
  41. normalizedMediaType.push('base64');
  42. }
  43. if (normalizedMediaType.length > 0 || (mimeType && mimeType !== DATA_URL_DEFAULT_MIME_TYPE)) {
  44. normalizedMediaType.unshift(mimeType);
  45. }
  46. return `data:${normalizedMediaType.join(';')},${isBase64 ? data.trim() : data}${hash ? `#${hash}` : ''}`;
  47. };
  48. function normalizeUrl(urlString, options) {
  49. options = {
  50. defaultProtocol: 'http:',
  51. normalizeProtocol: true,
  52. forceHttp: false,
  53. forceHttps: false,
  54. stripAuthentication: true,
  55. stripHash: false,
  56. stripTextFragment: true,
  57. stripWWW: true,
  58. removeQueryParameters: [/^utm_\w+/i],
  59. removeTrailingSlash: true,
  60. removeSingleSlash: true,
  61. removeDirectoryIndex: false,
  62. sortQueryParameters: true,
  63. ...options,
  64. };
  65. urlString = urlString.trim();
  66. // Data URL
  67. if (/^data:/i.test(urlString)) {
  68. return normalizeDataURL(urlString, options);
  69. }
  70. if (/^view-source:/i.test(urlString)) {
  71. throw new Error('`view-source:` is not supported as it is a non-standard protocol');
  72. }
  73. const hasRelativeProtocol = urlString.startsWith('//');
  74. const isRelativeUrl = !hasRelativeProtocol && /^\.*\//.test(urlString);
  75. // Prepend protocol
  76. if (!isRelativeUrl) {
  77. urlString = urlString.replace(/^(?!(?:\w+:)?\/\/)|^\/\//, options.defaultProtocol);
  78. }
  79. const urlObject = new URL(urlString);
  80. if (options.forceHttp && options.forceHttps) {
  81. throw new Error('The `forceHttp` and `forceHttps` options cannot be used together');
  82. }
  83. if (options.forceHttp && urlObject.protocol === 'https:') {
  84. urlObject.protocol = 'http:';
  85. }
  86. if (options.forceHttps && urlObject.protocol === 'http:') {
  87. urlObject.protocol = 'https:';
  88. }
  89. // Remove auth
  90. if (options.stripAuthentication) {
  91. urlObject.username = '';
  92. urlObject.password = '';
  93. }
  94. // Remove hash
  95. if (options.stripHash) {
  96. urlObject.hash = '';
  97. } else if (options.stripTextFragment) {
  98. urlObject.hash = urlObject.hash.replace(/#?:~:text.*?$/i, '');
  99. }
  100. // Remove duplicate slashes if not preceded by a protocol
  101. // NOTE: This could be implemented using a single negative lookbehind
  102. // regex, but we avoid that to maintain compatibility with older js engines
  103. // which do not have support for that feature.
  104. if (urlObject.pathname) {
  105. // TODO: Replace everything below with `urlObject.pathname = urlObject.pathname.replace(/(?<!\b[a-z][a-z\d+\-.]{1,50}:)\/{2,}/g, '/');` when Safari supports negative lookbehind.
  106. // Split the string by occurrences of this protocol regex, and perform
  107. // duplicate-slash replacement on the strings between those occurrences
  108. // (if any).
  109. const protocolRegex = /\b[a-z][a-z\d+\-.]{1,50}:\/\//g;
  110. let lastIndex = 0;
  111. let result = '';
  112. for (;;) {
  113. const match = protocolRegex.exec(urlObject.pathname);
  114. if (!match) {
  115. break;
  116. }
  117. const protocol = match[0];
  118. const protocolAtIndex = match.index;
  119. const intermediate = urlObject.pathname.slice(lastIndex, protocolAtIndex);
  120. result += intermediate.replace(/\/{2,}/g, '/');
  121. result += protocol;
  122. lastIndex = protocolAtIndex + protocol.length;
  123. }
  124. const remnant = urlObject.pathname.slice(lastIndex, urlObject.pathname.length);
  125. result += remnant.replace(/\/{2,}/g, '/');
  126. urlObject.pathname = result;
  127. }
  128. // Decode URI octets
  129. if (urlObject.pathname) {
  130. try {
  131. urlObject.pathname = decodeURI(urlObject.pathname);
  132. } catch {}
  133. }
  134. // Remove directory index
  135. if (options.removeDirectoryIndex === true) {
  136. options.removeDirectoryIndex = [/^index\.[a-z]+$/];
  137. }
  138. if (Array.isArray(options.removeDirectoryIndex) && options.removeDirectoryIndex.length > 0) {
  139. let pathComponents = urlObject.pathname.split('/');
  140. const lastComponent = pathComponents[pathComponents.length - 1];
  141. if (testParameter(lastComponent, options.removeDirectoryIndex)) {
  142. pathComponents = pathComponents.slice(0, -1);
  143. urlObject.pathname = pathComponents.slice(1).join('/') + '/';
  144. }
  145. }
  146. if (urlObject.hostname) {
  147. // Remove trailing dot
  148. urlObject.hostname = urlObject.hostname.replace(/\.$/, '');
  149. // Remove `www.`
  150. if (options.stripWWW && /^www\.(?!www\.)[a-z\-\d]{1,63}\.[a-z.\-\d]{2,63}$/.test(urlObject.hostname)) {
  151. // Each label should be max 63 at length (min: 1).
  152. // Source: https://en.wikipedia.org/wiki/Hostname#Restrictions_on_valid_host_names
  153. // Each TLD should be up to 63 characters long (min: 2).
  154. // It is technically possible to have a single character TLD, but none currently exist.
  155. urlObject.hostname = urlObject.hostname.replace(/^www\./, '');
  156. }
  157. }
  158. // Remove query unwanted parameters
  159. if (Array.isArray(options.removeQueryParameters)) {
  160. // eslint-disable-next-line unicorn/no-useless-spread -- We are intentionally spreading to get a copy.
  161. for (const key of [...urlObject.searchParams.keys()]) {
  162. if (testParameter(key, options.removeQueryParameters)) {
  163. urlObject.searchParams.delete(key);
  164. }
  165. }
  166. }
  167. if (options.removeQueryParameters === true) {
  168. urlObject.search = '';
  169. }
  170. // Sort query parameters
  171. if (options.sortQueryParameters) {
  172. urlObject.searchParams.sort();
  173. // Calling `.sort()` encodes the search parameters, so we need to decode them again.
  174. try {
  175. urlObject.search = decodeURIComponent(urlObject.search);
  176. } catch {}
  177. }
  178. if (options.removeTrailingSlash) {
  179. urlObject.pathname = urlObject.pathname.replace(/\/$/, '');
  180. }
  181. const oldUrlString = urlString;
  182. // Take advantage of many of the Node `url` normalizations
  183. urlString = urlObject.toString();
  184. if (!options.removeSingleSlash && urlObject.pathname === '/' && !oldUrlString.endsWith('/') && urlObject.hash === '') {
  185. urlString = urlString.replace(/\/$/, '');
  186. }
  187. // Remove ending `/` unless removeSingleSlash is false
  188. if ((options.removeTrailingSlash || urlObject.pathname === '/') && urlObject.hash === '' && options.removeSingleSlash) {
  189. urlString = urlString.replace(/\/$/, '');
  190. }
  191. // Restore relative protocol, if applicable
  192. if (hasRelativeProtocol && !options.normalizeProtocol) {
  193. urlString = urlString.replace(/^http:\/\//, '//');
  194. }
  195. // Remove http/https
  196. if (options.stripProtocol) {
  197. urlString = urlString.replace(/^(?:https?:)?\/\//, '');
  198. }
  199. return urlString;
  200. }
  201. // Dependencies
  202. /**
  203. * parseUrl
  204. * Parses the input url.
  205. *
  206. * **Note**: This *throws* if invalid urls are provided.
  207. *
  208. * @name parseUrl
  209. * @function
  210. * @param {String} url The input url.
  211. * @param {Boolean|Object} normalize Whether to normalize the url or not.
  212. * Default is `false`. If `true`, the url will
  213. * be normalized. If an object, it will be the
  214. * options object sent to [`normalize-url`](https://github.com/sindresorhus/normalize-url).
  215. *
  216. * For SSH urls, normalize won't work.
  217. *
  218. * @return {Object} An object containing the following fields:
  219. *
  220. * - `protocols` (Array): An array with the url protocols (usually it has one element).
  221. * - `protocol` (String): The first protocol, `"ssh"` (if the url is a ssh url) or `"file"`.
  222. * - `port` (null|Number): The domain port.
  223. * - `resource` (String): The url domain (including subdomains).
  224. * - `user` (String): The authentication user (usually for ssh urls).
  225. * - `pathname` (String): The url pathname.
  226. * - `hash` (String): The url hash.
  227. * - `search` (String): The url querystring value.
  228. * - `href` (String): The input url.
  229. * - `query` (Object): The url querystring, parsed as object.
  230. * - `parse_failed` (Boolean): Whether the parsing failed or not.
  231. */
  232. const parseUrl = (url, normalize = false) => {
  233. // Constants
  234. const GIT_RE = /^(?:([a-z_][a-z0-9_-]{0,31})@|https?:\/\/)([\w\.\-@]+)[\/:]([\~,\.\w,\-,\_,\/]+?(?:\.git|\/)?)$/;
  235. const throwErr = msg => {
  236. const err = new Error(msg);
  237. err.subject_url = url;
  238. throw err
  239. };
  240. if (typeof url !== "string" || !url.trim()) {
  241. throwErr("Invalid url.");
  242. }
  243. if (url.length > parseUrl.MAX_INPUT_LENGTH) {
  244. throwErr("Input exceeds maximum length. If needed, change the value of parseUrl.MAX_INPUT_LENGTH.");
  245. }
  246. if (normalize) {
  247. if (typeof normalize !== "object") {
  248. normalize = {
  249. stripHash: false
  250. };
  251. }
  252. url = normalizeUrl(url, normalize);
  253. }
  254. const parsed = parsePath__default["default"](url);
  255. // Potential git-ssh urls
  256. if (parsed.parse_failed) {
  257. const matched = parsed.href.match(GIT_RE);
  258. if (matched) {
  259. parsed.protocols = ["ssh"];
  260. parsed.protocol = "ssh";
  261. parsed.resource = matched[2];
  262. parsed.host = matched[2];
  263. parsed.user = matched[1];
  264. parsed.pathname = `/${matched[3]}`;
  265. parsed.parse_failed = false;
  266. } else {
  267. throwErr("URL parsing failed.");
  268. }
  269. }
  270. return parsed;
  271. };
  272. parseUrl.MAX_INPUT_LENGTH = 2048;
  273. module.exports = parseUrl;