Watchdog.cs 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506
  1. /*
  2. * Copyright (c) Contributors, http://opensimulator.org/
  3. * See CONTRIBUTORS.TXT for a full list of copyright holders.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. * * Redistributions of source code must retain the above copyright
  8. * notice, this list of conditions and the following disclaimer.
  9. * * Redistributions in binary form must reproduce the above copyright
  10. * notice, this list of conditions and the following disclaimer in the
  11. * documentation and/or other materials provided with the distribution.
  12. * * Neither the name of the OpenSimulator Project nor the
  13. * names of its contributors may be used to endorse or promote products
  14. * derived from this software without specific prior written permission.
  15. *
  16. * THIS SOFTWARE IS PROVIDED BY THE DEVELOPERS ``AS IS'' AND ANY
  17. * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  18. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  19. * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
  20. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  21. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  22. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  23. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  25. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. */
  27. using System;
  28. using System.Collections.Generic;
  29. using System.Linq;
  30. using System.Threading;
  31. using log4net;
  32. namespace OpenSim.Framework.Monitoring
  33. {
  34. /// <summary>
  35. /// Manages launching threads and keeping watch over them for timeouts
  36. /// </summary>
  37. public static class Watchdog
  38. {
  39. /// <summary>Timer interval in milliseconds for the watchdog timer</summary>
  40. public const double WATCHDOG_INTERVAL_MS = 2500.0d;
  41. /// <summary>Default timeout in milliseconds before a thread is considered dead</summary>
  42. public const int DEFAULT_WATCHDOG_TIMEOUT_MS = 5000;
  43. [System.Diagnostics.DebuggerDisplay("{Thread.Name}")]
  44. public class ThreadWatchdogInfo
  45. {
  46. public Thread Thread { get; private set; }
  47. /// <summary>
  48. /// Approximate tick when this thread was started.
  49. /// </summary>
  50. /// <remarks>
  51. /// Not terribly good since this quickly wraps around.
  52. /// </remarks>
  53. public int FirstTick { get; private set; }
  54. /// <summary>
  55. /// Last time this heartbeat update was invoked
  56. /// </summary>
  57. public int LastTick { get; set; }
  58. /// <summary>
  59. /// Number of milliseconds before we notify that the thread is having a problem.
  60. /// </summary>
  61. public int Timeout { get; set; }
  62. /// <summary>
  63. /// Is this thread considered timed out?
  64. /// </summary>
  65. public bool IsTimedOut { get; set; }
  66. /// <summary>
  67. /// Will this thread trigger the alarm function if it has timed out?
  68. /// </summary>
  69. public bool AlarmIfTimeout { get; set; }
  70. /// <summary>
  71. /// Method execute if alarm goes off. If null then no alarm method is fired.
  72. /// </summary>
  73. public Func<string> AlarmMethod { get; set; }
  74. /// <summary>
  75. /// Stat structure associated with this thread.
  76. /// </summary>
  77. public Stat Stat { get; set; }
  78. public ThreadWatchdogInfo(Thread thread, int timeout)
  79. {
  80. Thread = thread;
  81. Timeout = timeout;
  82. FirstTick = Environment.TickCount & Int32.MaxValue;
  83. LastTick = FirstTick;
  84. Stat
  85. = new Stat(
  86. thread.Name,
  87. string.Format("Last update of thread {0}", thread.Name),
  88. "",
  89. "ms",
  90. "server",
  91. "thread",
  92. StatType.Pull,
  93. MeasuresOfInterest.None,
  94. stat => stat.Value = Environment.TickCount & Int32.MaxValue - LastTick,
  95. StatVerbosity.Debug);
  96. StatsManager.RegisterStat(Stat);
  97. }
  98. public ThreadWatchdogInfo(ThreadWatchdogInfo previousTwi)
  99. {
  100. Thread = previousTwi.Thread;
  101. FirstTick = previousTwi.FirstTick;
  102. LastTick = previousTwi.LastTick;
  103. Timeout = previousTwi.Timeout;
  104. IsTimedOut = previousTwi.IsTimedOut;
  105. AlarmIfTimeout = previousTwi.AlarmIfTimeout;
  106. AlarmMethod = previousTwi.AlarmMethod;
  107. }
  108. public void Cleanup()
  109. {
  110. StatsManager.DeregisterStat(Stat);
  111. }
  112. }
  113. /// <summary>
  114. /// This event is called whenever a tracked thread is
  115. /// stopped or has not called UpdateThread() in time<
  116. /// /summary>
  117. public static event Action<ThreadWatchdogInfo> OnWatchdogTimeout;
  118. public static JobEngine JobEngine { get; private set; }
  119. /// <summary>
  120. /// Is this watchdog active?
  121. /// </summary>
  122. public static bool Enabled
  123. {
  124. get { return m_enabled; }
  125. set
  126. {
  127. // m_log.DebugFormat("[MEMORY WATCHDOG]: Setting MemoryWatchdog.Enabled to {0}", value);
  128. if (value == m_enabled)
  129. return;
  130. m_enabled = value;
  131. if (m_enabled)
  132. {
  133. // Set now so we don't get alerted on the first run
  134. LastWatchdogThreadTick = Environment.TickCount & Int32.MaxValue;
  135. }
  136. m_watchdogTimer.Enabled = m_enabled;
  137. }
  138. }
  139. private static bool m_enabled;
  140. private static readonly ILog m_log = LogManager.GetLogger(System.Reflection.MethodBase.GetCurrentMethod().DeclaringType);
  141. private static Dictionary<int, ThreadWatchdogInfo> m_threads;
  142. private static System.Timers.Timer m_watchdogTimer;
  143. /// <summary>
  144. /// Last time the watchdog thread ran.
  145. /// </summary>
  146. /// <remarks>
  147. /// Should run every WATCHDOG_INTERVAL_MS
  148. /// </remarks>
  149. public static int LastWatchdogThreadTick { get; private set; }
  150. static Watchdog()
  151. {
  152. JobEngine = new JobEngine();
  153. m_threads = new Dictionary<int, ThreadWatchdogInfo>();
  154. m_watchdogTimer = new System.Timers.Timer(WATCHDOG_INTERVAL_MS);
  155. m_watchdogTimer.AutoReset = false;
  156. m_watchdogTimer.Elapsed += WatchdogTimerElapsed;
  157. }
  158. /// <summary>
  159. /// Start a new thread that is tracked by the watchdog timer.
  160. /// </summary>
  161. /// <param name="start">The method that will be executed in a new thread</param>
  162. /// <param name="name">A name to give to the new thread</param>
  163. /// <param name="priority">Priority to run the thread at</param>
  164. /// <param name="isBackground">True to run this thread as a background thread, otherwise false</param>
  165. /// <param name="alarmIfTimeout">Trigger an alarm function is we have timed out</param>
  166. /// <param name="log">If true then creation of thread is logged.</param>
  167. /// <returns>The newly created Thread object</returns>
  168. public static Thread StartThread(
  169. ThreadStart start, string name, ThreadPriority priority, bool isBackground, bool alarmIfTimeout, bool log = true)
  170. {
  171. return StartThread(start, name, priority, isBackground, alarmIfTimeout, null, DEFAULT_WATCHDOG_TIMEOUT_MS, log);
  172. }
  173. /// <summary>
  174. /// Start a new thread that is tracked by the watchdog
  175. /// </summary>
  176. /// <param name="start">The method that will be executed in a new thread</param>
  177. /// <param name="name">A name to give to the new thread</param>
  178. /// <param name="priority">Priority to run the thread at</param>
  179. /// <param name="isBackground">True to run this thread as a background
  180. /// thread, otherwise false</param>
  181. /// <param name="alarmIfTimeout">Trigger an alarm function is we have timed out</param>
  182. /// <param name="alarmMethod">
  183. /// Alarm method to call if alarmIfTimeout is true and there is a timeout.
  184. /// Normally, this will just return some useful debugging information.
  185. /// </param>
  186. /// <param name="timeout">Number of milliseconds to wait until we issue a warning about timeout.</param>
  187. /// <param name="log">If true then creation of thread is logged.</param>
  188. /// <returns>The newly created Thread object</returns>
  189. public static Thread StartThread(
  190. ThreadStart start, string name, ThreadPriority priority, bool isBackground,
  191. bool alarmIfTimeout, Func<string> alarmMethod, int timeout, bool log = true)
  192. {
  193. Thread thread = new Thread(start);
  194. thread.Name = name;
  195. thread.Priority = priority;
  196. thread.IsBackground = isBackground;
  197. ThreadWatchdogInfo twi
  198. = new ThreadWatchdogInfo(thread, timeout)
  199. { AlarmIfTimeout = alarmIfTimeout, AlarmMethod = alarmMethod };
  200. if (log)
  201. m_log.DebugFormat(
  202. "[WATCHDOG]: Started tracking thread {0}, ID {1}", twi.Thread.Name, twi.Thread.ManagedThreadId);
  203. lock (m_threads)
  204. m_threads.Add(twi.Thread.ManagedThreadId, twi);
  205. thread.Start();
  206. return thread;
  207. }
  208. /// <summary>
  209. /// Run the callback in a new thread immediately. If the thread exits with an exception log it but do
  210. /// not propogate it.
  211. /// </summary>
  212. /// <param name="callback">Code for the thread to execute.</param>
  213. /// <param name="name">Name of the thread</param>
  214. /// <param name="obj">Object to pass to the thread.</param>
  215. public static void RunInThread(WaitCallback callback, string name, object obj, bool log = false)
  216. {
  217. if (Util.FireAndForgetMethod == FireAndForgetMethod.RegressionTest)
  218. {
  219. Culture.SetCurrentCulture();
  220. callback(obj);
  221. return;
  222. }
  223. ThreadStart ts = new ThreadStart(delegate()
  224. {
  225. try
  226. {
  227. Culture.SetCurrentCulture();
  228. callback(obj);
  229. Watchdog.RemoveThread(log:false);
  230. }
  231. catch (Exception e)
  232. {
  233. m_log.Error(string.Format("[WATCHDOG]: Exception in thread {0}.", name), e);
  234. }
  235. });
  236. StartThread(ts, name, ThreadPriority.Normal, true, false, log:log);
  237. }
  238. /// <summary>
  239. /// Marks the current thread as alive
  240. /// </summary>
  241. public static void UpdateThread()
  242. {
  243. UpdateThread(Thread.CurrentThread.ManagedThreadId);
  244. }
  245. /// <summary>
  246. /// Stops watchdog tracking on the current thread
  247. /// </summary>
  248. /// <param name="log">If true then normal events in thread removal are not logged.</param>
  249. /// <returns>
  250. /// True if the thread was removed from the list of tracked
  251. /// threads, otherwise false
  252. /// </returns>
  253. public static bool RemoveThread(bool log = true)
  254. {
  255. return RemoveThread(Thread.CurrentThread.ManagedThreadId, log);
  256. }
  257. private static bool RemoveThread(int threadID, bool log = true)
  258. {
  259. lock (m_threads)
  260. {
  261. ThreadWatchdogInfo twi;
  262. if (m_threads.TryGetValue(threadID, out twi))
  263. {
  264. if (log)
  265. m_log.DebugFormat(
  266. "[WATCHDOG]: Removing thread {0}, ID {1}", twi.Thread.Name, twi.Thread.ManagedThreadId);
  267. twi.Cleanup();
  268. m_threads.Remove(threadID);
  269. return true;
  270. }
  271. else
  272. {
  273. m_log.WarnFormat(
  274. "[WATCHDOG]: Requested to remove thread with ID {0} but this is not being monitored", threadID);
  275. return false;
  276. }
  277. }
  278. }
  279. public static bool AbortThread(int threadID)
  280. {
  281. lock (m_threads)
  282. {
  283. if (m_threads.ContainsKey(threadID))
  284. {
  285. ThreadWatchdogInfo twi = m_threads[threadID];
  286. twi.Thread.Abort();
  287. RemoveThread(threadID);
  288. return true;
  289. }
  290. else
  291. {
  292. return false;
  293. }
  294. }
  295. }
  296. private static void UpdateThread(int threadID)
  297. {
  298. ThreadWatchdogInfo threadInfo;
  299. // Although TryGetValue is not a thread safe operation, we use a try/catch here instead
  300. // of a lock for speed. Adding/removing threads is a very rare operation compared to
  301. // UpdateThread(), and a single UpdateThread() failure here and there won't break
  302. // anything
  303. try
  304. {
  305. if (m_threads.TryGetValue(threadID, out threadInfo))
  306. {
  307. threadInfo.LastTick = Environment.TickCount & Int32.MaxValue;
  308. threadInfo.IsTimedOut = false;
  309. }
  310. else
  311. {
  312. m_log.WarnFormat("[WATCHDOG]: Asked to update thread {0} which is not being monitored", threadID);
  313. }
  314. }
  315. catch { }
  316. }
  317. /// <summary>
  318. /// Get currently watched threads for diagnostic purposes
  319. /// </summary>
  320. /// <returns></returns>
  321. public static ThreadWatchdogInfo[] GetThreadsInfo()
  322. {
  323. lock (m_threads)
  324. return m_threads.Values.ToArray();
  325. }
  326. /// <summary>
  327. /// Return the current thread's watchdog info.
  328. /// </summary>
  329. /// <returns>The watchdog info. null if the thread isn't being monitored.</returns>
  330. public static ThreadWatchdogInfo GetCurrentThreadInfo()
  331. {
  332. lock (m_threads)
  333. {
  334. if (m_threads.ContainsKey(Thread.CurrentThread.ManagedThreadId))
  335. return m_threads[Thread.CurrentThread.ManagedThreadId];
  336. }
  337. return null;
  338. }
  339. /// <summary>
  340. /// Check watched threads. Fire alarm if appropriate.
  341. /// </summary>
  342. /// <param name="sender"></param>
  343. /// <param name="e"></param>
  344. private static void WatchdogTimerElapsed(object sender, System.Timers.ElapsedEventArgs e)
  345. {
  346. int now = Environment.TickCount & Int32.MaxValue;
  347. int msElapsed = now - LastWatchdogThreadTick;
  348. if (msElapsed > WATCHDOG_INTERVAL_MS * 2)
  349. m_log.WarnFormat(
  350. "[WATCHDOG]: {0} ms since Watchdog last ran. Interval should be approximately {1} ms",
  351. msElapsed, WATCHDOG_INTERVAL_MS);
  352. LastWatchdogThreadTick = Environment.TickCount & Int32.MaxValue;
  353. Action<ThreadWatchdogInfo> callback = OnWatchdogTimeout;
  354. if (callback != null)
  355. {
  356. List<ThreadWatchdogInfo> callbackInfos = null;
  357. lock (m_threads)
  358. {
  359. foreach (ThreadWatchdogInfo threadInfo in m_threads.Values)
  360. {
  361. if (threadInfo.Thread.ThreadState == ThreadState.Stopped)
  362. {
  363. RemoveThread(threadInfo.Thread.ManagedThreadId);
  364. if (callbackInfos == null)
  365. callbackInfos = new List<ThreadWatchdogInfo>();
  366. callbackInfos.Add(threadInfo);
  367. }
  368. else if (!threadInfo.IsTimedOut && now - threadInfo.LastTick >= threadInfo.Timeout)
  369. {
  370. threadInfo.IsTimedOut = true;
  371. if (threadInfo.AlarmIfTimeout)
  372. {
  373. if (callbackInfos == null)
  374. callbackInfos = new List<ThreadWatchdogInfo>();
  375. // Send a copy of the watchdog info to prevent race conditions where the watchdog
  376. // thread updates the monitoring info after an alarm has been sent out.
  377. callbackInfos.Add(new ThreadWatchdogInfo(threadInfo));
  378. }
  379. }
  380. }
  381. }
  382. if (callbackInfos != null)
  383. foreach (ThreadWatchdogInfo callbackInfo in callbackInfos)
  384. callback(callbackInfo);
  385. }
  386. if (MemoryWatchdog.Enabled)
  387. MemoryWatchdog.Update();
  388. ChecksManager.CheckChecks();
  389. StatsManager.RecordStats();
  390. m_watchdogTimer.Start();
  391. }
  392. /// <summary>
  393. /// Run a job.
  394. /// </summary>
  395. /// <remarks>
  396. /// This differs from direct scheduling (e.g. Util.FireAndForget) in that a job can be run in the job
  397. /// engine if it is running, where all jobs are currently performed in sequence on a single thread. This is
  398. /// to prevent observed overload and server freeze problems when there are hundreds of connections which all attempt to
  399. /// perform work at once (e.g. in conference situations). With lower numbers of connections, the small
  400. /// delay in performing jobs in sequence rather than concurrently has not been notiecable in testing, though a future more
  401. /// sophisticated implementation could perform jobs concurrently when the server is under low load.
  402. ///
  403. /// However, be advised that some callers of this function rely on all jobs being performed in sequence if any
  404. /// jobs are performed in sequence (i.e. if jobengine is active or not). Therefore, expanding the jobengine
  405. /// beyond a single thread will require considerable thought.
  406. ///
  407. /// Also, any jobs submitted must be guaranteed to complete within a reasonable timeframe (e.g. they cannot
  408. /// incorporate a network delay with a long timeout). At the moment, work that could suffer such issues
  409. /// should still be run directly with RunInThread(), Util.FireAndForget(), etc. This is another area where
  410. /// the job engine could be improved and so CPU utilization improved by better management of concurrency within
  411. /// OpenSimulator.
  412. /// </remarks>
  413. /// <param name="jobType">General classification for the job (e.g. "RezAttachments").</param>
  414. /// <param name="callback">Callback for job.</param>
  415. /// <param name="name">Specific name of job (e.g. "RezAttachments for Joe Bloggs"</param>
  416. /// <param name="obj">Object to pass to callback when run</param>
  417. /// <param name="canRunInThisThread">If set to true then the job may be run in ths calling thread.</param>
  418. /// <param name="mustNotTimeout">If the true then the job must never timeout.</param>
  419. /// <param name="log">If set to true then extra logging is performed.</param>
  420. public static void RunJob(
  421. string jobType, WaitCallback callback, string name, object obj,
  422. bool canRunInThisThread = false, bool mustNotTimeout = false,
  423. bool log = false)
  424. {
  425. if (Util.FireAndForgetMethod == FireAndForgetMethod.RegressionTest)
  426. {
  427. Culture.SetCurrentCulture();
  428. callback(obj);
  429. return;
  430. }
  431. if (JobEngine.IsRunning)
  432. JobEngine.QueueRequest(name, callback, obj);
  433. else if (canRunInThisThread)
  434. callback(obj);
  435. else if (mustNotTimeout)
  436. RunInThread(callback, name, obj, log);
  437. else
  438. Util.FireAndForget(callback, obj, name);
  439. }
  440. }
  441. }