llstring.cpp 35 KB


  1. /**
  2. * @file llstring.cpp
  3. * @brief String utility functions and the std::string class.
  4. *
  5. * $LicenseInfo:firstyear=2001&license=viewergpl$
  6. *
  7. * Copyright (c) 2001-2009, Linden Research, Inc.
  8. *
  9. * Second Life Viewer Source Code
  10. * The source code in this file ("Source Code") is provided by Linden Lab
  11. * to you under the terms of the GNU General Public License, version 2.0
  12. * ("GPL"), unless you have obtained a separate licensing agreement
  13. * ("Other License"), formally executed by you and Linden Lab. Terms of
  14. * the GPL can be found in doc/GPL-license.txt in this distribution, or
  15. * online at http://secondlifegrid.net/programs/open_source/licensing/gplv2
  16. *
  17. * There are special exceptions to the terms and conditions of the GPL as
  18. * it is applied to this Source Code. View the full text of the exception
  19. * in the file doc/FLOSS-exception.txt in this software distribution, or
  20. * online at
  21. * http://secondlifegrid.net/programs/open_source/licensing/flossexception
  22. *
  23. * By copying, modifying or distributing this software, you acknowledge
  24. * that you have read and understood your obligations described above,
  25. * and agree to abide by those obligations.
  26. *
  27. * ALL LINDEN LAB SOURCE CODE IS PROVIDED "AS IS." LINDEN LAB MAKES NO
  28. * WARRANTIES, EXPRESS, IMPLIED OR OTHERWISE, REGARDING ITS ACCURACY,
  29. * COMPLETENESS OR PERFORMANCE.
  30. * $/LicenseInfo$
  31. */
  32. #include "linden_common.h"
  33. #include <cstdarg>
  34. #if LL_WINDOWS
  35. # include <winnls.h> // For WideCharToMultiByte
  36. # include <vector>
  37. #endif
  38. #include "llsd.h"
  39. #include "llstring.h"
  40. U8 hex_as_nybble(char hex)
  41. {
  42. if (hex >= '0' && hex <= '9')
  43. {
  44. return (U8)(hex - '0');
  45. }
  46. else if (hex >= 'a' && hex <='f')
  47. {
  48. return (U8)(10 + hex - 'a');
  49. }
  50. else if (hex >= 'A' && hex <='F')
  51. {
  52. return (U8)(10 + hex - 'A');
  53. }
  54. return 0; // uh - oh, not hex any more...
  55. }
  56. // See http://www.unicode.org/Public/BETA/CVTUTF-1-2/ConvertUTF.c
  57. // for the Unicode implementation - this doesn't match because it was written
  58. // before finding it.
  59. std::ostream& operator<<(std::ostream& s, const LLWString& wstr)
  60. {
  61. std::string utf8_str = wstring_to_utf8str(wstr);
  62. s << utf8_str;
  63. return s;
  64. }
  65. std::ptrdiff_t wchar_to_utf8chars(llwchar in_char, char* outchars)
  66. {
  67. U32 cur_char = (U32)in_char;
  68. char* base = outchars;
  69. if (cur_char < 0x80)
  70. {
  71. *outchars++ = (U8)cur_char;
  72. }
  73. else if (cur_char < 0x800)
  74. {
  75. *outchars++ = 0xC0 | (cur_char >> 6);
  76. *outchars++ = 0x80 | (cur_char & 0x3F);
  77. }
  78. else if (cur_char < 0x10000)
  79. {
  80. *outchars++ = 0xE0 | (cur_char >> 12);
  81. *outchars++ = 0x80 | ((cur_char >> 6) & 0x3F);
  82. *outchars++ = 0x80 | (cur_char & 0x3F);
  83. }
  84. else if (cur_char < 0x200000)
  85. {
  86. *outchars++ = 0xF0 | (cur_char >> 18);
  87. *outchars++ = 0x80 | ((cur_char >> 12) & 0x3F);
  88. *outchars++ = 0x80 | ((cur_char >> 6) & 0x3F);
  89. *outchars++ = 0x80 | (cur_char & 0x3F);
  90. }
  91. else if (cur_char < 0x4000000)
  92. {
  93. *outchars++ = 0xF8 | (cur_char >> 24);
  94. *outchars++ = 0x80 | ((cur_char >> 18) & 0x3F);
  95. *outchars++ = 0x80 | ((cur_char >> 12) & 0x3F);
  96. *outchars++ = 0x80 | ((cur_char >> 6) & 0x3F);
  97. *outchars++ = 0x80 | (cur_char & 0x3F);
  98. }
  99. else if (cur_char < 0x80000000)
  100. {
  101. *outchars++ = 0xFC | (cur_char >> 30);
  102. *outchars++ = 0x80 | ((cur_char >> 24) & 0x3F);
  103. *outchars++ = 0x80 | ((cur_char >> 18) & 0x3F);
  104. *outchars++ = 0x80 | ((cur_char >> 12) & 0x3F);
  105. *outchars++ = 0x80 | ((cur_char >> 6) & 0x3F);
  106. *outchars++ = 0x80 | (cur_char & 0x3F);
  107. }
  108. else
  109. {
  110. llwarns << "Invalid Unicode character " << cur_char << "!" << llendl;
  111. *outchars++ = LL_UNKNOWN_CHAR;
  112. }
  113. return outchars - base;
  114. }
  115. static std::ptrdiff_t utf16chars_to_wchar(const U16* inchars, llwchar* outchar)
  116. {
  117. const U16* base = inchars;
  118. U16 cur_char = *inchars++;
  119. llwchar char32 = cur_char;
  120. if ((cur_char >= 0xD800) && (cur_char <= 0xDFFF))
  121. {
  122. // Surrogates
  123. char32 = ((llwchar)(cur_char - 0xD800)) << 10;
  124. cur_char = *inchars++;
  125. char32 += (llwchar)(cur_char - 0xDC00) + 0x0010000UL;
  126. }
  127. else
  128. {
  129. char32 = (llwchar)cur_char;
  130. }
  131. *outchar = char32;
  132. return std::ptrdiff_t(inchars - base);
  133. }
  134. llutf16string wstring_to_utf16str(const LLWString& utf32str, S32 len)
  135. {
  136. llutf16string out;
  137. S32 i = 0;
  138. while (i < len)
  139. {
  140. U32 cur_char = utf32str[i++];
  141. if (cur_char > 0xFFFF)
  142. {
  143. out += (0xD7C0 + (cur_char >> 10));
  144. out += (0xDC00 | (cur_char & 0x3FF));
  145. }
  146. else
  147. {
  148. out += cur_char;
  149. }
  150. }
  151. return out;
  152. }
  153. LLWString utf16str_to_wstring(const llutf16string& utf16str, S32 len)
  154. {
  155. LLWString wout;
  156. if (len <= 0 || utf16str.empty()) return wout;
  157. S32 i = 0;
  158. // craziness to make gcc happy (llutf16string.c_str() is tweaked on linux):
  159. const U16* chars16 = &(*(utf16str.begin()));
  160. while (i < len)
  161. {
  162. llwchar cur_char;
  163. i += utf16chars_to_wchar(chars16 + i, &cur_char);
  164. wout += cur_char;
  165. }
  166. return wout;
  167. }
  168. // Length in llwchar (UTF-32) of the first len units (16 bits) of the given
  169. // UTF-16 string.
  170. S32 utf16str_wstring_length(const llutf16string& utf16str, S32 utf16_len)
  171. {
  172. S32 surrogate_pairs = 0;
  173. // ... craziness to make gcc happy (llutf16string.c_str() is tweaked on
  174. // linux):
  175. const U16* const utf16_chars = &(*(utf16str.begin()));
  176. S32 i = 0;
  177. while (i < utf16_len)
  178. {
  179. const U16 c = utf16_chars[i++];
  180. if (c >= 0xD800 && c <= 0xDBFF) // See http://en.wikipedia.org/wiki/UTF-16
  181. {
  182. // Have first byte of a surrogate pair
  183. if (i >= utf16_len)
  184. {
  185. break;
  186. }
  187. const U16 d = utf16_chars[i];
  188. if (d >= 0xDC00 && d <= 0xDFFF)
  189. { // Have valid second byte of a surrogate pair
  190. ++surrogate_pairs;
  191. ++i;
  192. }
  193. }
  194. }
  195. return utf16_len - surrogate_pairs;
  196. }
  197. // Length in utf16string (UTF-16) of wlen wchars beginning at woffset.
  198. S32 wstring_utf16_length(const LLWString& wstr, S32 woffset, S32 wlen)
  199. {
  200. const S32 end = llmin((S32)wstr.length(), woffset + wlen);
  201. if (end < woffset)
  202. {
  203. return 0;
  204. }
  205. else
  206. {
  207. S32 length = end - woffset;
  208. for (S32 i = woffset; i < end; ++i)
  209. {
  210. if (wstr[i] >= 0x10000)
  211. {
  212. ++length;
  213. }
  214. }
  215. return length;
  216. }
  217. }
// Given a wstring and an offset in it, returns the length as wstring (i.e.,
// number of llwchars) of the longest substring that starts at the offset
// and whose equivalent UTF-16 string does not exceed the given utf16_length.
  221. S32 wstring_length_from_utf16_length(const LLWString& wstr, S32 woffset,
  222. S32 utf16_length, bool* unaligned)
  223. {
  224. const S32 end = wstr.length();
  225. bool u = false;
  226. S32 n = woffset + utf16_length;
  227. S32 i = woffset;
  228. while (i < end)
  229. {
  230. if (wstr[i] >= 0x10000)
  231. {
  232. --n;
  233. }
  234. if (i >= n)
  235. {
  236. u = (i > n);
  237. break;
  238. }
  239. ++i;
  240. }
  241. if (unaligned)
  242. {
  243. *unaligned = u;
  244. }
  245. return i - woffset;
  246. }
  247. S32 wchar_utf8_length(const llwchar wc)
  248. {
  249. if (wc < 0x80)
  250. {
  251. // This case will also catch negative values which are
  252. // technically invalid.
  253. return 1;
  254. }
  255. else if (wc < 0x800)
  256. {
  257. return 2;
  258. }
  259. else if (wc < 0x10000)
  260. {
  261. return 3;
  262. }
  263. else if (wc < 0x200000)
  264. {
  265. return 4;
  266. }
  267. else if (wc < 0x4000000)
  268. {
  269. return 5;
  270. }
  271. else
  272. {
  273. return 6;
  274. }
  275. }
  276. S32 wstring_utf8_length(const LLWString& wstr)
  277. {
  278. S32 len = 0;
  279. for (S32 i = 0, count = wstr.length(); i < count; ++i)
  280. {
  281. len += wchar_utf8_length(wstr[i]);
  282. }
  283. return len;
  284. }
  285. LLWString utf8str_to_wstring(const std::string& utf8str, S32 len)
  286. {
  287. LLWString wout;
  288. S32 max_len = utf8str.length();
  289. S32 i = 0;
  290. while (i < len)
  291. {
  292. llwchar unichar;
  293. U8 cur_char = utf8str[i];
  294. if (cur_char < 0x80)
  295. {
  296. // Ascii character, just add it
  297. unichar = cur_char;
  298. }
  299. else
  300. {
  301. S32 cont_bytes = 0;
  302. if ((cur_char >> 5) == 0x6) // Two byte UTF8 -> 1 UTF32
  303. {
  304. unichar = 0x1F & cur_char;
  305. cont_bytes = 1;
  306. }
  307. else if ((cur_char >> 4) == 0xe) // Three byte UTF8 -> 1 UTF32
  308. {
  309. unichar = 0x0F & cur_char;
  310. cont_bytes = 2;
  311. }
  312. else if ((cur_char >> 3) == 0x1e) // Four byte UTF8 -> 1 UTF32
  313. {
  314. unichar = 0x07 & cur_char;
  315. cont_bytes = 3;
  316. }
  317. else if ((cur_char >> 2) == 0x3e) // Five byte UTF8 -> 1 UTF32
  318. {
  319. unichar = 0x03 & cur_char;
  320. cont_bytes = 4;
  321. }
  322. else if ((cur_char >> 1) == 0x7e) // Six byte UTF8 -> 1 UTF32
  323. {
  324. unichar = 0x01 & cur_char;
  325. cont_bytes = 5;
  326. }
  327. else
  328. {
  329. wout += LL_UNKNOWN_CHAR;
  330. ++i;
  331. continue;
  332. }
  333. // Check that this character doesn't go past the end of the string
  334. S32 end = len < i + cont_bytes ? len : i + cont_bytes;
  335. do
  336. {
  337. if (++i >= max_len)
  338. {
  339. // Malformed sequence - roll back to look at this as a new
  340. // char
  341. unichar = LL_UNKNOWN_CHAR;
  342. --i;
  343. break;
  344. }
  345. cur_char = utf8str[i];
  346. if ((cur_char >> 6) == 0x2)
  347. {
  348. unichar <<= 6;
  349. unichar += 0x3F & cur_char;
  350. }
  351. else
  352. {
  353. // Malformed sequence - roll back to look at this as a new
  354. // char
  355. unichar = LL_UNKNOWN_CHAR;
  356. --i;
  357. break;
  358. }
  359. }
  360. while (i < end);
  361. // Handle overlong characters and NULL characters
  362. if ((cont_bytes == 1 && unichar < 0x80) ||
  363. (cont_bytes == 2 && unichar < 0x800) ||
  364. (cont_bytes == 3 && unichar < 0x10000) ||
  365. (cont_bytes == 4 && unichar < 0x200000) ||
  366. (cont_bytes == 5 && unichar < 0x4000000))
  367. {
  368. unichar = LL_UNKNOWN_CHAR;
  369. }
  370. }
  371. wout += unichar;
  372. ++i;
  373. }
  374. return wout;
  375. }
  376. std::string wstring_to_utf8str(const LLWString& utf32str, S32 len)
  377. {
  378. std::string out;
  379. S32 i = 0;
  380. while (i < len)
  381. {
  382. char tchars[8];
  383. S32 n = wchar_to_utf8chars(utf32str[i++], tchars);
  384. tchars[n] = 0;
  385. out += tchars;
  386. }
  387. return out;
  388. }
  389. std::string utf8str_trim(const std::string& utf8str)
  390. {
  391. LLWString wstr = utf8str_to_wstring(utf8str);
  392. LLWStringUtil::trim(wstr);
  393. return wstring_to_utf8str(wstr);
  394. }
  395. std::string utf8str_tolower(const std::string& utf8str)
  396. {
  397. LLWString out_str = utf8str_to_wstring(utf8str);
  398. LLWStringUtil::toLower(out_str);
  399. return wstring_to_utf8str(out_str);
  400. }
  401. S32 utf8str_compare_insensitive(const std::string& lhs, const std::string& rhs)
  402. {
  403. LLWString wlhs = utf8str_to_wstring(lhs);
  404. LLWString wrhs = utf8str_to_wstring(rhs);
  405. return LLWStringUtil::compareInsensitive(wlhs, wrhs);
  406. }
  407. std::string utf8str_truncate(const std::string& utf8str, S32 max_len)
  408. {
  409. if (!max_len)
  410. {
  411. return std::string();
  412. }
  413. if ((S32)utf8str.length() <= max_len)
  414. {
  415. return utf8str;
  416. }
  417. else
  418. {
  419. S32 cur_char = max_len;
  420. // If we are ASCII, we do not need to do anything
  421. if ((U8)utf8str[cur_char] > 0x7f)
  422. {
  423. // If first two bits are (10), it is the tail end of a multibyte
  424. // char. We need to shift back to the first character
  425. while ((0xc0 & utf8str[cur_char]) == 0x80)
  426. {
  427. // Keep moving forward until we hit the first char;
  428. if (--cur_char == 0)
  429. {
  430. // Make sure we do not trash memory if we've got a bogus
  431. // string.
  432. break;
  433. }
  434. }
  435. }
  436. // The byte index we are on is one we want to get rid of, so we only
  437. // want to copy up to (cur_char-1) chars
  438. return utf8str.substr(0, cur_char);
  439. }
  440. }
  441. std::string utf8str_substChar(const std::string& utf8str,
  442. const llwchar target_char,
  443. const llwchar replace_char)
  444. {
  445. LLWString wstr = utf8str_to_wstring(utf8str);
  446. LLWStringUtil::replaceChar(wstr, target_char, replace_char);
  447. //wstr = wstring_substChar(wstr, target_char, replace_char);
  448. return wstring_to_utf8str(wstr);
  449. }
  450. std::string utf8str_makeASCII(const std::string& utf8str)
  451. {
  452. LLWString wstr = utf8str_to_wstring(utf8str);
  453. LLWStringUtil::_makeASCII(wstr);
  454. return wstring_to_utf8str(wstr);
  455. }
  456. std::string mbcsstring_makeASCII(const std::string& wstr)
  457. {
  458. // Replace non-ASCII chars with replace_char
  459. std::string out_str = wstr;
  460. for (S32 i = 0, len = out_str.length(); i < len; ++i)
  461. {
  462. if ((U8)out_str[i] > 0x7f)
  463. {
  464. out_str[i] = LL_UNKNOWN_CHAR;
  465. }
  466. }
  467. return out_str;
  468. }
  469. std::string utf8str_removeCRLF(const std::string& utf8str)
  470. {
  471. std::string out;
  472. size_t len = utf8str.length();
  473. if (!len) return out;
  474. out.reserve(len);
  475. for (size_t i = 0; i < len; ++i)
  476. {
  477. unsigned char c = utf8str[i];
  478. if (c != 13)
  479. {
  480. out.push_back(c);
  481. }
  482. }
  483. return out;
  484. }
  485. std::string iso8859_to_utf8(const std::string& iso8859str)
  486. {
  487. std::string out;
  488. size_t len = iso8859str.length();
  489. if (!len) return out;
  490. out.reserve(2 * len);
  491. for (size_t i = 0; i < len; ++i)
  492. {
  493. unsigned char c = iso8859str[i];
  494. if (c < 128)
  495. {
  496. out.push_back(c);
  497. }
  498. else
  499. {
  500. out.push_back(0xc2 + (c > 0xbf));
  501. out.push_back(0x80 + (c & 0x3f));
  502. }
  503. }
  504. return out;
  505. }
  506. std::string utf8_to_iso8859(const std::string& utf8str)
  507. {
  508. std::string out;
  509. size_t len = utf8str.length();
  510. if (!len) return out;
  511. out.reserve(len);
  512. for (size_t i = 0; i < len; ++i)
  513. {
  514. unsigned char c = utf8str[i];
  515. if (c < 128)
  516. {
  517. out.push_back(c);
  518. }
  519. else if (i < len - 1)
  520. {
  521. out.push_back(((c & 0x1f) << 6) + (utf8str[++i] & 0x3f));
  522. }
  523. }
  524. return out;
  525. }
  526. #if LL_WINDOWS
  527. std::string ll_convert_wide_to_string(const wchar_t* in,
  528. unsigned int code_page)
  529. {
  530. std::string out;
  531. if (in)
  532. {
  533. int len_in = wcslen(in);
  534. int len_out = WideCharToMultiByte(code_page, 0, in, len_in, NULL, 0, 0,
  535. 0);
  536. // We will need two more bytes for the double NULL ending created in
  537. // WideCharToMultiByte().
  538. char* pout = new char[len_out + 2];
  539. memset(pout, 0, len_out + 2);
  540. if (pout)
  541. {
  542. WideCharToMultiByte(code_page, 0, in, len_in, pout, len_out, 0, 0);
  543. out.assign(pout);
  544. delete[] pout;
  545. }
  546. }
  547. return out;
  548. }
  549. std::string ll_convert_wide_to_string(const wchar_t* in)
  550. {
  551. return ll_convert_wide_to_string(in, CP_UTF8);
  552. }
  553. LLWString ll_convert_wide_to_wstring(const std::wstring& in)
  554. {
  555. // This function, like its converse, is a placeholder, encapsulating a
  556. // guilty little hack: the only "official" way Nat has found to convert
  557. // between std::wstring (16 bits on Windows) and LLWString (UTF-32) is
  558. // by using iconv, which we have avoided so far. It sorts of works to just
  559. // copy individual characters...
  560. // The point is that if/when we DO introduce some more official way to
  561. // perform such conversions, we should only have to call it here.
  562. return { in.begin(), in.end() };
  563. }
  564. std::wstring ll_convert_wstring_to_wide(const LLWString& in)
  565. {
  566. // See comments in ll_convert_wide_to_wstring()
  567. return { in.begin(), in.end() };
  568. }
  569. std::basic_string<wchar_t> ll_convert_string_to_wide(const std::string& in)
  570. {
  571. return ll_convert_string_to_wide(in, CP_UTF8);
  572. }
  573. std::basic_string<wchar_t> ll_convert_string_to_wide(const std::string& in,
  574. unsigned int code_page)
  575. {
  576. // From review:
  577. // We can preallocate a wide char buffer that is the same length (in
  578. // wchar_t elements) as the utf8 input, plus one for a nul terminator, and
  579. // be guaranteed to not overflow.
  580. // Normally, I would call that sort of thing premature optimization, but we
  581. // *are* seeing string operations taking a bunch of time, especially when
  582. // constructing widgets.
  583. //int output_str_len = MultiByteToWideChar(code_page, 0, in.c_str(),
  584. // in.length(), NULL, 0);
  585. // Reserve an output buffer that will be destroyed on exit, with a place to
  586. // put a NUL terminator.
  587. std::vector<wchar_t> w_out(in.length() + 1);
  588. size_t len = w_out.size();
  589. memset(&w_out[0], 0, len);
  590. int real_output_str_len = MultiByteToWideChar(code_page, 0, in.c_str(),
  591. in.length(), &w_out[0], len);
  592. // Looks like MultiByteToWideChar didn't add null terminator to converted
  593. // string, see EXT-4858.
  594. w_out[real_output_str_len] = 0;
  595. // Construct string<wchar_t> from our temporary output buffer
  596. return {&w_out[0]};
  597. }
  598. std::string ll_convert_string_to_utf8_string(const std::string& in)
  599. {
  600. auto w_mesg = ll_convert_string_to_wide(in, CP_ACP);
  601. std::string out_utf8(ll_convert_wide_to_string(w_mesg.c_str(), CP_UTF8));
  602. return out_utf8;
  603. }
  604. #endif // LL_WINDOWS
  605. ///////////////////////////////////////////////////////////////////////////////
  606. // Formerly in u64.cpp - Utilities for conversions between U64 and string
  607. ///////////////////////////////////////////////////////////////////////////////
  608. U64 str_to_U64(const std::string& str)
  609. {
  610. U64 result = 0;
  611. const char* aptr = strpbrk(str.c_str(), "0123456789");
  612. if (!aptr)
  613. {
  614. llwarns << "Bad string to U64 conversion attempt: format" << llendl;
  615. }
  616. else
  617. {
  618. while (*aptr >= '0' && *aptr <= '9')
  619. {
  620. result = result * 10 + (*aptr++ - '0');
  621. }
  622. }
  623. return result;
  624. }
  625. std::string U64_to_str(U64 value)
  626. {
  627. std::string res;
  628. U32 part1, part2, part3;
  629. part3 = (U32)(value % (U64)10000000);
  630. value /= 10000000;
  631. part2 = (U32)(value % (U64)10000000);
  632. value /= 10000000;
  633. part1 = (U32)(value % (U64)10000000);
  634. // Three cases to avoid leading zeroes unless necessary
  635. if (part1)
  636. {
  637. res = llformat("%u%07u%07u", part1, part2, part3);
  638. }
  639. else if (part2)
  640. {
  641. res = llformat("%u%07u", part2, part3);
  642. }
  643. else
  644. {
  645. res = llformat("%u", part3);
  646. }
  647. return res;
  648. }
  649. char* U64_to_str(U64 value, char* result, S32 result_size)
  650. {
  651. std::string res = U64_to_str(value);
  652. LLStringUtil::copy(result, res.c_str(), result_size);
  653. return result;
  654. }
  655. U64 llstrtou64(const char* str, char** end, S32 base)
  656. {
  657. #ifdef LL_WINDOWS
  658. return _strtoui64(str, end, base);
  659. #else
  660. return strtoull(str, end, base);
  661. #endif
  662. }
  663. ///////////////////////////////////////////////////////////////////////////////
  664. // LLStringOps class
  665. ///////////////////////////////////////////////////////////////////////////////
  666. long LLStringOps::sPacificTimeOffset = 0;
  667. long LLStringOps::sLocalTimeOffset = 0;
  668. bool LLStringOps::sPacificDaylightTime = 0;
  669. std::map<std::string, std::string> LLStringOps::datetimeToCodes;
  670. std::vector<std::string> LLStringOps::sWeekDayList;
  671. std::vector<std::string> LLStringOps::sWeekDayShortList;
  672. std::vector<std::string> LLStringOps::sMonthList;
  673. std::vector<std::string> LLStringOps::sMonthShortList;
  674. std::string LLStringOps::sDayFormat;
  675. std::string LLStringOps::sAM;
  676. std::string LLStringOps::sPM;
  677. //static
  678. bool LLStringOps::isHexString(const std::string& str)
  679. {
  680. const char* buf = str.c_str();
  681. int len = str.size();
  682. while (--len >= 0)
  683. {
  684. if (!isxdigit(buf[len])) return false;
  685. }
  686. return true;
  687. }
  688. //static
  689. bool LLStringOps::isEmoji(llwchar a)
  690. {
  691. #if 0 // Do not consider special characters that might have a corresponding
  692. // glyph in the monochorme fallback fonts as a "genuine" emoji. HB
  693. return a == 0xa9 || a == 0xae || (a >= 0x2000 && a < 0x3300) ||
  694. (a >= 0x1f000 && a < 0x20000);
  695. #else
  696. // These are indeed "genuine" emojis. HB
  697. return a >= 0x1f000 && a < 0x20000;
  698. #endif
  699. }
  700. //static
  701. S32 LLStringOps::collate(const llwchar* a, const llwchar* b)
  702. {
  703. #if LL_WINDOWS
  704. // Under Windows, wide string functions operator on 16-bit strings, not the
  705. // proper 32 bit wide string.
  706. return strcmp(wstring_to_utf8str(LLWString(a)).c_str(),
  707. wstring_to_utf8str(LLWString(b)).c_str());
  708. #else
  709. return wcscoll(a, b);
  710. #endif
  711. }
  712. //static
  713. void LLStringOps::setupDatetimeInfo(bool daylight)
  714. {
  715. time_t nowT, localT, gmtT;
  716. struct tm * tmpT;
  717. nowT = time (NULL);
  718. tmpT = gmtime(&nowT);
  719. gmtT = mktime (tmpT);
  720. tmpT = localtime (&nowT);
  721. localT = mktime (tmpT);
  722. sLocalTimeOffset = (long) (gmtT - localT);
  723. if (tmpT->tm_isdst)
  724. {
  725. sLocalTimeOffset -= 60 * 60; // 1 hour
  726. }
  727. sPacificDaylightTime = daylight;
  728. sPacificTimeOffset = (sPacificDaylightTime? 7 : 8) * 60 * 60;
  729. datetimeToCodes["wkday"] = "%a"; // Thu
  730. datetimeToCodes["weekday"] = "%A"; // Thursday
  731. datetimeToCodes["year4"] = "%Y"; // 2009
  732. datetimeToCodes["year"] = "%Y"; // 2009
  733. datetimeToCodes["year2"] = "%y"; // 09
  734. datetimeToCodes["mth"] = "%b"; // Aug
  735. datetimeToCodes["month"] = "%B"; // August
  736. datetimeToCodes["mthnum"] = "%m"; // 08
  737. datetimeToCodes["day"] = "%d"; // 31
  738. datetimeToCodes["sday"] = "%-d"; // 9
  739. datetimeToCodes["hour24"] = "%H"; // 14
  740. datetimeToCodes["hour"] = "%H"; // 14
  741. datetimeToCodes["hour12"] = "%I"; // 02
  742. datetimeToCodes["min"] = "%M"; // 59
  743. datetimeToCodes["ampm"] = "%p"; // AM
  744. datetimeToCodes["second"] = "%S"; // 59
  745. datetimeToCodes["timezone"] = "%Z"; // PST
  746. }
  747. static void tokenize_str_to_aray(const std::string& data,
  748. std::vector<std::string>& output)
  749. {
  750. output.clear();
  751. size_t length = data.size();
  752. // Tokenize it and put it in the array
  753. std::string cur_word;
  754. for (size_t i = 0; i < length; ++i)
  755. {
  756. if (data[i] == ':')
  757. {
  758. output.push_back(cur_word);
  759. cur_word.clear();
  760. }
  761. else
  762. {
  763. cur_word.append(1, data[i]);
  764. }
  765. }
  766. output.push_back(cur_word);
  767. }
  768. //static
  769. void LLStringOps::setupWeekDaysNames(const std::string& data)
  770. {
  771. tokenize_str_to_aray(data, sWeekDayList);
  772. }
  773. //static
  774. void LLStringOps::setupWeekDaysShortNames(const std::string& data)
  775. {
  776. tokenize_str_to_aray(data, sWeekDayShortList);
  777. }
  778. //static
  779. void LLStringOps::setupMonthNames(const std::string& data)
  780. {
  781. tokenize_str_to_aray(data, sMonthList);
  782. }
  783. //static
  784. void LLStringOps::setupMonthShortNames(const std::string& data)
  785. {
  786. tokenize_str_to_aray(data, sMonthShortList);
  787. }
  788. //static
  789. void LLStringOps::setupDayFormat(const std::string& data)
  790. {
  791. sDayFormat = data;
  792. }
  793. //static
  794. std::string LLStringOps::getDatetimeCode(std::string key)
  795. {
  796. std::map<std::string, std::string>::iterator iter;
  797. iter = datetimeToCodes.find (key);
  798. if (iter != datetimeToCodes.end())
  799. {
  800. return iter->second;
  801. }
  802. return std::string("");
  803. }
  804. namespace LLStringFn
  805. {
  806. // Note: this restricts output to ASCII
  807. void replace_nonprintable_in_ascii(std::basic_string<char>& str,
  808. char replacement)
  809. {
  810. constexpr char SPACE = 0x20;
  811. for (size_t i = 0, len = str.size(); i < len; ++i)
  812. {
  813. if (str[i] < SPACE)
  814. {
  815. str[i] = replacement;
  816. }
  817. }
  818. }
  819. // Note: this restricts output to ASCII
  820. void replace_nonprintable_and_pipe_in_ascii(std::basic_string<char>& str,
  821. char replacement)
  822. {
  823. constexpr char SPACE = 0x20;
  824. constexpr char PIPE = 0x7c;
  825. for (size_t i = 0, len = str.size(); i < len; ++i)
  826. {
  827. if (str[i] < SPACE || str[i] == PIPE)
  828. {
  829. str[i] = replacement;
  830. }
  831. }
  832. }
  833. // Replaces all control characters (c < 0x20) with replacement in string.
  834. void replace_ascii_controlchars(std::basic_string<char>& str,
  835. char replacement)
  836. {
  837. constexpr unsigned char SPACE = 0x20;
  838. for (size_t i = 0, len = str.size(); i < len; ++i)
  839. {
  840. const unsigned char c = (unsigned char)str[i];
  841. if (c < SPACE)
  842. {
  843. str[i] = replacement;
  844. }
  845. }
  846. }
  847. // https://wiki.lindenlab.com/wiki/Unicode_Guidelines has details on
  848. // allowable code points for XML. Specifically, they are:
  849. // 0x09, 0x0a, 0x0d, and 0x20 on up. JC
  850. std::string strip_invalid_xml(const std::string& str)
  851. {
  852. constexpr unsigned char SPACE = 0x20;
  853. constexpr unsigned char TAB = 0x09;
  854. constexpr unsigned char LF = 0x0a;
  855. constexpr unsigned char CR = 0x0d;
  856. std::string output;
  857. output.reserve(str.size());
  858. std::string::const_iterator it = str.begin();
  859. while (it != str.end())
  860. {
  861. // Must compare as unsigned for >=
  862. // Test most likely match first
  863. const unsigned char c = (unsigned char)*it;
  864. if (c >= SPACE || c == TAB || c == LF || c == CR)
  865. {
  866. output.push_back(c);
  867. }
  868. ++it;
  869. }
  870. return output;
  871. }
  872. typedef std::map<char, std::string> literals_map_t;
  873. static const literals_map_t xml_elem_literals =
  874. {
  875. { '<', "&lt;" },
  876. { '>', "&gt;" },
  877. { '&', "&amp;" }
  878. };
  879. static const literals_map_t xml_attr_literals =
  880. {
  881. { '"', "&quot;" },
  882. { '\'', "&apos;" }
  883. };
  884. static void literals_encode(std::string& text,
  885. const literals_map_t& literals)
  886. {
  887. for (literals_map_t::const_iterator it = literals.begin(),
  888. end = literals.end();
  889. it != end; ++it)
  890. {
  891. size_t pos = 0;
  892. while ((pos = text.find(it->first, pos)) != std::string::npos)
  893. {
  894. text.replace(pos, 1, it->second);
  895. pos += it->second.size();
  896. }
  897. }
  898. }
  899. // Replaces all characters that are not allowed in XML 1.0 with the
  900. // corresponding literals.
  901. std::string xml_encode(const std::string& input, bool for_attribute)
  902. {
  903. std::string result(input);
  904. literals_encode(result, xml_elem_literals);
  905. if (for_attribute)
  906. {
  907. literals_encode(result, xml_attr_literals);
  908. }
  909. return result;
  910. }
  911. static void literals_decode(std::string& text,
  912. const literals_map_t& literals)
  913. {
  914. for (literals_map_t::const_iterator it = literals.begin(),
  915. end = literals.end();
  916. it != end; ++it)
  917. {
  918. size_t pos = 0;
  919. while ((pos = text.find(it->second, pos)) != std::string::npos)
  920. {
  921. text[pos++] = it->first;
  922. text.erase(pos, it->second.size() - 1);
  923. }
  924. }
  925. }
  926. // Replaces some of XML literals that are defined in XML 1.0 with the
  927. // corresponding characters.
  928. std::string xml_decode(const std::string& input, bool for_attribute)
  929. {
  930. std::string result(input);
  931. literals_decode(result, xml_elem_literals);
  932. if (for_attribute)
  933. {
  934. literals_decode(result, xml_attr_literals);
  935. }
  936. return result;
  937. }
  938. }
  939. ////////////////////////////////////////////////////////////
  940. // Forward specialization of LLStringUtil::format before use in
  941. // LLStringUtil::formatDatetime.
  942. template<>
  943. S32 LLStringUtil::format(std::string& s, const format_map_t& substitutions);
  944. //static
  945. template<>
  946. void LLStringUtil::getTokens(const std::string& instr,
  947. std::vector<std::string>& tokens,
  948. const std::string& delims)
  949. {
  950. std::string token;
  951. size_t start = instr.find_first_not_of(delims);
  952. while (start != std::string::npos)
  953. {
  954. size_t end = instr.find_first_of(delims, start);
  955. if (end == std::string::npos)
  956. {
  957. end = instr.length();
  958. }
  959. token = instr.substr(start, end - start);
  960. LLStringUtil::trim(token);
  961. tokens.push_back(token);
  962. start = instr.find_first_not_of(delims, end);
  963. }
  964. }
  965. //static
  966. template<>
  967. LLStringUtil::size_type LLStringUtil::getSubstitution(const std::string& instr,
  968. size_type& start,
  969. std::vector<std::string>& tokens)
  970. {
  971. const std::string delims (",");
  972. // Find the first [
  973. size_type pos1 = instr.find('[', start);
  974. if (pos1 == std::string::npos)
  975. return std::string::npos;
  976. //Find the first ] after the initial [
  977. size_type pos2 = instr.find(']', pos1);
  978. if (pos2 == std::string::npos)
  979. return std::string::npos;
  980. // Find the last [ before ] in case of nested [[]]
  981. pos1 = instr.find_last_of('[', pos2 - 1);
  982. if (pos1 == std::string::npos || pos1 < start)
  983. return std::string::npos;
  984. getTokens(std::string(instr ,pos1 + 1, pos2 - pos1 - 1), tokens, delims);
  985. start = pos2 + 1;
  986. return pos1;
  987. }
  988. //static
  989. template<>
  990. bool LLStringUtil::simpleReplacement(std::string& replacement,
  991. const std::string& token,
  992. const format_map_t& substitutions)
  993. {
  994. // See if we have a replacement for the bracketed string (without the
  995. // brackets) test first using has() because if we just look up with
  996. // operator[] we get back an empty string even if the value is missing.
  997. // We want to distinguish between missing replacements and deliberately
  998. // empty replacement strings.
  999. format_map_t::const_iterator iter = substitutions.find(token);
  1000. if (iter != substitutions.end())
  1001. {
  1002. replacement = iter->second;
  1003. return true;
  1004. }
  1005. // If not, see if there's one WITH brackets
  1006. iter = substitutions.find(std::string("[" + token + "]"));
  1007. if (iter != substitutions.end())
  1008. {
  1009. replacement = iter->second;
  1010. return true;
  1011. }
  1012. return false;
  1013. }
  1014. //static
  1015. template<>
  1016. bool LLStringUtil::simpleReplacement(std::string& replacement,
  1017. const std::string& token,
  1018. const LLSD& substitutions)
  1019. {
  1020. // See if we have a replacement for the bracketed string (without the
  1021. // brackets). Test first using has() because if we just look up with
  1022. // operator[] we get back an empty string even if the value is missing.
  1023. // We want to distinguish between missing replacements and deliberately
  1024. // empty replacement strings.
  1025. if (substitutions.has(token))
  1026. {
  1027. replacement = substitutions[token].asString();
  1028. return true;
  1029. }
  1030. // If not, see if there's one WITH brackets
  1031. else if (substitutions.has(std::string("[" + token + "]")))
  1032. {
  1033. replacement = substitutions[std::string("[" + token + "]")].asString();
  1034. return true;
  1035. }
  1036. return false;
  1037. }
  1038. //static
  1039. template<>
  1040. void LLStringUtil::setLocale(std::string in_locale)
  1041. {
  1042. sLocale = in_locale;
  1043. }
  1044. //static
  1045. template<>
  1046. std::string LLStringUtil::getLocale()
  1047. {
  1048. return sLocale;
  1049. }
  1050. //static
  1051. template<>
  1052. void LLStringUtil::formatNumber(std::string& num_str, S32 decimals)
  1053. {
  1054. std::stringstream str_stream;
  1055. if (!sLocale.empty())
  1056. {
  1057. // std::locale() throws if the locale is unknown ! (EXT-7926)
  1058. try
  1059. {
  1060. str_stream.imbue(std::locale(sLocale.c_str()));
  1061. }
  1062. catch (const std::exception&)
  1063. {
  1064. llwarns_once << "Cannot set locale to " << sLocale << llendl;
  1065. }
  1066. }
  1067. if (!decimals)
  1068. {
  1069. S32 int_str;
  1070. if (convertToS32(num_str, int_str))
  1071. {
  1072. str_stream << int_str;
  1073. num_str = str_stream.str();
  1074. }
  1075. }
  1076. else
  1077. {
  1078. F32 float_str;
  1079. if (convertToF32(num_str, float_str))
  1080. {
  1081. str_stream << std::fixed << std::showpoint
  1082. << std::setprecision(decimals) << float_str;
  1083. num_str = str_stream.str();
  1084. }
  1085. }
  1086. }
  1087. //static
  1088. template<>
  1089. bool LLStringUtil::formatDatetime(std::string& replacement,
  1090. const std::string& token,
  1091. const std::string& param, S32 sec_epoch)
  1092. {
  1093. if (param == "local") // Local time
  1094. {
  1095. sec_epoch -= LLStringOps::getLocalTimeOffset();
  1096. }
  1097. else if (param != "utc" && param != "gmt") // SL time
  1098. {
  1099. sec_epoch -= LLStringOps::getPacificTimeOffset();
  1100. }
  1101. // If never fell into those two ifs above, param must be utc
  1102. if (sec_epoch < 0) sec_epoch = 0;
  1103. LLDate datetime((F64)sec_epoch);
  1104. std::string code = LLStringOps::getDatetimeCode(token);
  1105. // Special case to handle timezone
  1106. if (code == "%Z")
  1107. {
  1108. if (param == "utc" || param == "gmt")
  1109. {
  1110. replacement = "UTC";
  1111. }
  1112. else if (param == "local")
  1113. {
  1114. replacement.clear(); // User knows their own timezone
  1115. }
  1116. else
  1117. {
  1118. // "slt" = Second Life Time, which is deprecated.
  1119. // If not UTC or user local time, fallback to pacific time
  1120. replacement = LLStringOps::getPacificDaylightTime() ? "PDT"
  1121. : "PST";
  1122. }
  1123. return true;
  1124. }
  1125. // EXT-7013: few codes are not suppotred by strtime function (example:
  1126. // weekdays for Japanese), so use predefined ones.
  1127. // If sWeekDayList is not empty than current locale does not support the
  1128. // weekday name.
  1129. time_t loc_seconds = (time_t) sec_epoch;
  1130. if (LLStringOps::sWeekDayList.size() == 7 && code == "%A")
  1131. {
  1132. struct tm* gmt = gmtime(&loc_seconds);
  1133. replacement = LLStringOps::sWeekDayList[gmt->tm_wday];
  1134. }
  1135. else if (LLStringOps::sWeekDayShortList.size() == 7 && code == "%a")
  1136. {
  1137. struct tm* gmt = gmtime(&loc_seconds);
  1138. replacement = LLStringOps::sWeekDayShortList[gmt->tm_wday];
  1139. }
  1140. else if (LLStringOps::sMonthList.size() == 12 && code == "%B")
  1141. {
  1142. struct tm* gmt = gmtime(&loc_seconds);
  1143. replacement = LLStringOps::sMonthList[gmt->tm_mon];
  1144. }
  1145. else if (!LLStringOps::sDayFormat.empty() && code == "%d")
  1146. {
  1147. struct tm* gmt = gmtime(&loc_seconds);
  1148. LLStringUtil::format_map_t args;
  1149. args["[MDAY]"] = llformat ("%d", gmt->tm_mday);
  1150. replacement = LLStringOps::sDayFormat;
  1151. LLStringUtil::format(replacement, args);
  1152. }
  1153. else if (code == "%-d")
  1154. {
  1155. struct tm* gmt = gmtime(&loc_seconds);
  1156. // Day of the month without leading zero
  1157. replacement = llformat ("%d", gmt->tm_mday);
  1158. }
  1159. else if (!LLStringOps::sAM.empty() && !LLStringOps::sPM.empty() &&
  1160. code == "%p")
  1161. {
  1162. struct tm* gmt = gmtime(&loc_seconds);
  1163. if (gmt->tm_hour<12)
  1164. {
  1165. replacement = LLStringOps::sAM;
  1166. }
  1167. else
  1168. {
  1169. replacement = LLStringOps::sPM;
  1170. }
  1171. }
  1172. else
  1173. {
  1174. replacement = datetime.toHTTPDateString(code.c_str());
  1175. }
  1176. // *HACK: delete leading zero from hour string in case 'hour12' (code = %I)
  1177. // time format to show time without leading zero, e.g. 08:16 -> 8:16
  1178. // (EXT-2738). We could have used '%l' format instead, but it is not
  1179. // supported by Windows.
  1180. if (code == "%I" && token == "hour12" && replacement[0] == '0')
  1181. {
  1182. replacement = replacement[1];
  1183. }
  1184. return !code.empty();
  1185. }
  1186. // LLStringUtil::format recogizes the following patterns.
  1187. // All substitutions *must* be encased in []'s in the input string. The []'s
  1188. // are optional in the substitution map.
  1189. // [FOO_123]
  1190. // [FOO,number,precision]
  1191. // [FOO,datetime,format]
  1192. //static
  1193. template<>
  1194. S32 LLStringUtil::format(std::string& s, const format_map_t& substitutions)
  1195. {
  1196. S32 res = 0;
  1197. std::string output;
  1198. std::vector<std::string> tokens;
  1199. size_t start = 0;
  1200. size_t prev_start = 0;
  1201. size_t key_start = 0;
  1202. while ((key_start = getSubstitution(s, start, tokens)) != std::string::npos)
  1203. {
  1204. output += std::string(s, prev_start, key_start-prev_start);
  1205. prev_start = start;
  1206. bool found_replacement = false;
  1207. std::string replacement;
  1208. if (tokens.size() == 0)
  1209. {
  1210. found_replacement = false;
  1211. }
  1212. else if (tokens.size() == 1)
  1213. {
  1214. found_replacement = simpleReplacement(replacement, tokens[0],
  1215. substitutions);
  1216. }
  1217. else if (tokens[1] == "number")
  1218. {
  1219. std::string param = "0";
  1220. if (tokens.size() > 2)
  1221. {
  1222. param = tokens[2];
  1223. }
  1224. found_replacement = simpleReplacement(replacement, tokens[0],
  1225. substitutions);
  1226. if (found_replacement)
  1227. {
  1228. formatNumber(replacement, atoi(param.c_str()));
  1229. }
  1230. }
  1231. else if (tokens[1] == "datetime")
  1232. {
  1233. std::string param;
  1234. if (tokens.size() > 2)
  1235. {
  1236. param = tokens[2];
  1237. }
  1238. format_map_t::const_iterator iter = substitutions.find("datetime");
  1239. if (iter != substitutions.end())
  1240. {
  1241. S32 sec_epoch = 0;
  1242. bool r = LLStringUtil::convertToS32(iter->second,
  1243. sec_epoch);
  1244. if (r)
  1245. {
  1246. found_replacement = formatDatetime(replacement, tokens[0],
  1247. param, sec_epoch);
  1248. }
  1249. }
  1250. }
  1251. if (found_replacement)
  1252. {
  1253. output += replacement;
  1254. ++res;
  1255. }
  1256. else
  1257. {
  1258. // We had no replacement, use the string as is. E.g.
  1259. // "hello [MISSING_REPLACEMENT]" or "-=[Stylized Name]=-"
  1260. output += std::string(s, key_start, start - key_start);
  1261. }
  1262. tokens.clear();
  1263. }
  1264. // Send the remainder of the string (with no further matches for bracketed
  1265. // names)
  1266. output += std::string(s, start);
  1267. s = output;
  1268. return res;
  1269. }
  1270. //static
  1271. template<>
  1272. S32 LLStringUtil::format(std::string& s, const LLSD& substitutions)
  1273. {
  1274. S32 res = 0;
  1275. if (!substitutions.isMap())
  1276. {
  1277. return res;
  1278. }
  1279. std::string output;
  1280. std::vector<std::string> tokens;
  1281. size_t start = 0;
  1282. size_t prev_start = 0;
  1283. size_t key_start = 0;
  1284. while ((key_start = getSubstitution(s, start, tokens)) != std::string::npos)
  1285. {
  1286. output += std::string(s, prev_start, key_start - prev_start);
  1287. prev_start = start;
  1288. bool found_replacement = false;
  1289. std::string replacement;
  1290. if (tokens.size() == 0)
  1291. {
  1292. found_replacement = false;
  1293. }
  1294. else if (tokens.size() == 1)
  1295. {
  1296. found_replacement = simpleReplacement(replacement, tokens[0],
  1297. substitutions);
  1298. }
  1299. else if (tokens[1] == "number")
  1300. {
  1301. std::string param = "0";
  1302. if (tokens.size() > 2)
  1303. {
  1304. param = tokens[2];
  1305. }
  1306. found_replacement = simpleReplacement(replacement, tokens[0],
  1307. substitutions);
  1308. if (found_replacement)
  1309. {
  1310. formatNumber(replacement, atoi(param.c_str()));
  1311. }
  1312. }
  1313. else if (tokens[1] == "datetime")
  1314. {
  1315. std::string param;
  1316. if (tokens.size() > 2)
  1317. {
  1318. param = tokens[2];
  1319. }
  1320. S32 sec_epoch = (S32)substitutions["datetime"].asInteger();
  1321. found_replacement = formatDatetime(replacement, tokens[0],
  1322. param, sec_epoch);
  1323. }
  1324. if (found_replacement)
  1325. {
  1326. output += replacement;
  1327. ++res;
  1328. }
  1329. else
  1330. {
  1331. // We had no replacement, use the string as is. E.g.
  1332. // "hello [MISSING_REPLACEMENT]" or "-=[Stylized Name]=-"
  1333. output += std::string(s, key_start, start-key_start);
  1334. }
  1335. tokens.clear();
  1336. }
  1337. // Send the remainder of the string (with no further matches for bracketed
  1338. // names)
  1339. output += std::string(s, start);
  1340. s = output;
  1341. return res;
  1342. }
  // This used to be in separate llformat.cpp file. Moved here for coherency. HB
  // printf()-like formatting into a std::string.
  //  - fmt: printf()-style format string; a NULL pointer yields an empty
  //         string.
  //  - ...: the arguments consumed by 'fmt'.
  // Note: uses an internal buffer limited to 1024 characters, i.e. the result
  // is silently truncated to 1023 characters at most.
  std::string llformat(const char* fmt, ...)
  {
      // Avoid allocating 1024 bytes on the stack (or worst, depending on the
      // compiler: on the heap) at *each* call; instead use a static buffer in
      // the thread local storage (so that we stay thread-safe). HB
      thread_local char buffer[1024];

      if (LL_UNLIKELY(!fmt))
      {
          return std::string();
      }

      va_list va;
      va_start(va, fmt);
  #if LL_WINDOWS
      // Unlike vsnprintf(), _vsnprintf() does NOT write the terminating NUL
      // when the formatted text fills or overflows the count it is given: keep
      // the last byte of the buffer out of its reach.
      _vsnprintf(buffer, sizeof(buffer) - 1, fmt, va);
  #else
      vsnprintf(buffer, sizeof(buffer), fmt, va);
  #endif
      va_end(va);

      // Always terminate the buffer, so that the std::string constructor below
      // can never read past its end, whatever the formatting function did.
      buffer[sizeof(buffer) - 1] = '\0';

      return std::string(buffer);
  }