Watchdog.cs 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389
  1. /*
  2. * Copyright (c) Contributors, http://opensimulator.org/
  3. * See CONTRIBUTORS.TXT for a full list of copyright holders.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. * * Redistributions of source code must retain the above copyright
  8. * notice, this list of conditions and the following disclaimer.
  9. * * Redistributions in binary form must reproduce the above copyright
  10. * notice, this list of conditions and the following disclaimer in the
  11. * documentation and/or other materials provided with the distribution.
  12. * * Neither the name of the OpenSimulator Project nor the
  13. * names of its contributors may be used to endorse or promote products
  14. * derived from this software without specific prior written permission.
  15. *
  16. * THIS SOFTWARE IS PROVIDED BY THE DEVELOPERS ``AS IS'' AND ANY
  17. * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  18. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  19. * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
  20. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  21. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  22. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  23. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  25. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. */
  27. using System;
  28. using System.Collections.Generic;
  29. using System.Linq;
  30. using System.Threading;
  31. using log4net;
  namespace OpenSim.Framework.Monitoring
  {
      /// <summary>
      /// Manages launching threads and keeping watch over them for timeouts.
      /// </summary>
      /// <remarks>
      /// Threads started through <see cref="StartThread(ThreadStart, string, ThreadPriority, bool, bool)"/>
      /// are registered by managed thread ID. Each tracked thread is expected to call
      /// <see cref="UpdateThread()"/> regularly and <see cref="RemoveThread()"/> before it exits.
      /// A one-shot timer, re-armed at the end of every pass, inspects the registered threads
      /// roughly every <see cref="WATCHDOG_INTERVAL_MS"/> milliseconds while <see cref="Enabled"/> is true.
      /// </remarks>
      public static class Watchdog
      {
          /// <summary>Timer interval in milliseconds for the watchdog timer</summary>
          public const double WATCHDOG_INTERVAL_MS = 2500.0d;

          /// <summary>Default timeout in milliseconds before a thread is considered dead</summary>
          public const int DEFAULT_WATCHDOG_TIMEOUT_MS = 5000;

          /// <summary>
          /// Monitoring state kept for a single tracked thread.
          /// </summary>
          [System.Diagnostics.DebuggerDisplay("{Thread.Name}")]
          public class ThreadWatchdogInfo
          {
              /// <summary>
              /// The thread being monitored.
              /// </summary>
              public Thread Thread { get; private set; }

              /// <summary>
              /// Approximate tick when this thread was started.
              /// </summary>
              /// <remarks>
              /// Not terribly good since this quickly wraps around
              /// (it is Environment.TickCount masked to a non-negative int).
              /// </remarks>
              public int FirstTick { get; private set; }

              /// <summary>
              /// Tick at which the heartbeat for this thread was last updated.
              /// </summary>
              public int LastTick { get; set; }

              /// <summary>
              /// Number of milliseconds before we notify that the thread is having a problem.
              /// </summary>
              public int Timeout { get; set; }

              /// <summary>
              /// Is this thread considered timed out?
              /// </summary>
              public bool IsTimedOut { get; set; }

              /// <summary>
              /// Will this thread trigger the alarm function if it has timed out?
              /// </summary>
              public bool AlarmIfTimeout { get; set; }

              /// <summary>
              /// Method to execute if the alarm goes off. If null then no alarm method is fired.
              /// </summary>
              /// <remarks>
              /// The watchdog class itself only stores this delegate; presumably it is invoked by
              /// subscribers of the timeout event - confirm against the callers.
              /// </remarks>
              public Func<string> AlarmMethod { get; set; }

              /// <summary>
              /// Create monitoring state for a thread, stamping both ticks with the current time.
              /// </summary>
              /// <param name="thread">The thread to monitor</param>
              /// <param name="timeout">Number of milliseconds without a heartbeat before the thread counts as timed out</param>
              public ThreadWatchdogInfo(Thread thread, int timeout)
              {
                  Thread = thread;
                  Timeout = timeout;
                  FirstTick = Environment.TickCount & Int32.MaxValue;
                  LastTick = FirstTick;
              }

              /// <summary>
              /// Create a snapshot copy of existing monitoring state.
              /// </summary>
              /// <param name="previousTwi">The state to copy</param>
              public ThreadWatchdogInfo(ThreadWatchdogInfo previousTwi)
              {
                  Thread = previousTwi.Thread;
                  FirstTick = previousTwi.FirstTick;
                  LastTick = previousTwi.LastTick;
                  Timeout = previousTwi.Timeout;
                  IsTimedOut = previousTwi.IsTimedOut;
                  AlarmIfTimeout = previousTwi.AlarmIfTimeout;
                  AlarmMethod = previousTwi.AlarmMethod;
              }
          }

          /// <summary>
          /// This event is called whenever a tracked thread is
          /// stopped or has not called UpdateThread() in time
          /// </summary>
          /// <remarks>
          /// Stopped threads are always reported. Timed out threads are only reported when their
          /// AlarmIfTimeout flag is set, and then as a copy of their monitoring info.
          /// </remarks>
          public static event Action<ThreadWatchdogInfo> OnWatchdogTimeout;

          /// <summary>
          /// Is this watchdog active?
          /// </summary>
          /// <remarks>
          /// Setting this starts or stops the underlying timer. Setting the current value again is a no-op.
          /// </remarks>
          public static bool Enabled
          {
              get { return m_enabled; }
              set
              {
                  if (value == m_enabled)
                      return;

                  m_enabled = value;

                  if (m_enabled)
                  {
                      // Set now so we don't get alerted on the first run
                      LastWatchdogThreadTick = Environment.TickCount & Int32.MaxValue;
                  }

                  m_watchdogTimer.Enabled = m_enabled;
              }
          }

          private static bool m_enabled;

          private static readonly ILog m_log = LogManager.GetLogger(System.Reflection.MethodBase.GetCurrentMethod().DeclaringType);

          /// <summary>
          /// Tracked threads keyed by managed thread ID. Also used as the lock object for all mutations.
          /// </summary>
          private static readonly Dictionary<int, ThreadWatchdogInfo> m_threads;

          /// <summary>
          /// One-shot timer (AutoReset is false); re-armed at the end of every watchdog pass.
          /// </summary>
          private static readonly System.Timers.Timer m_watchdogTimer;

          /// <summary>
          /// Last time the watchdog thread ran.
          /// </summary>
          /// <remarks>
          /// Should run every WATCHDOG_INTERVAL_MS
          /// </remarks>
          public static int LastWatchdogThreadTick { get; private set; }

          static Watchdog()
          {
              m_threads = new Dictionary<int, ThreadWatchdogInfo>();
              m_watchdogTimer = new System.Timers.Timer(WATCHDOG_INTERVAL_MS);
              m_watchdogTimer.AutoReset = false;
              m_watchdogTimer.Elapsed += WatchdogTimerElapsed;
          }

          /// <summary>
          /// Start a new thread that is tracked by the watchdog timer.
          /// </summary>
          /// <remarks>
          /// Uses no alarm method and a timeout of DEFAULT_WATCHDOG_TIMEOUT_MS.
          /// </remarks>
          /// <param name="start">The method that will be executed in a new thread</param>
          /// <param name="name">A name to give to the new thread</param>
          /// <param name="priority">Priority to run the thread at</param>
          /// <param name="isBackground">True to run this thread as a background thread, otherwise false</param>
          /// <param name="alarmIfTimeout">Trigger an alarm function if we have timed out</param>
          /// <returns>The newly created Thread object</returns>
          public static Thread StartThread(
              ThreadStart start, string name, ThreadPriority priority, bool isBackground, bool alarmIfTimeout)
          {
              return StartThread(start, name, priority, isBackground, alarmIfTimeout, null, DEFAULT_WATCHDOG_TIMEOUT_MS);
          }

          /// <summary>
          /// Start a new thread that is tracked by the watchdog timer
          /// </summary>
          /// <param name="start">The method that will be executed in a new thread</param>
          /// <param name="name">A name to give to the new thread</param>
          /// <param name="priority">Priority to run the thread at</param>
          /// <param name="isBackground">True to run this thread as a background
          /// thread, otherwise false</param>
          /// <param name="alarmIfTimeout">Trigger an alarm function if we have timed out</param>
          /// <param name="alarmMethod">
          /// Alarm method to call if alarmIfTimeout is true and there is a timeout.
          /// Normally, this will just return some useful debugging information.
          /// </param>
          /// <param name="timeout">Number of milliseconds to wait until we issue a warning about timeout.</param>
          /// <returns>The newly created Thread object</returns>
          public static Thread StartThread(
              ThreadStart start, string name, ThreadPriority priority, bool isBackground,
              bool alarmIfTimeout, Func<string> alarmMethod, int timeout)
          {
              Thread thread = new Thread(start);
              thread.Name = name;
              thread.Priority = priority;
              thread.IsBackground = isBackground;

              ThreadWatchdogInfo twi
                  = new ThreadWatchdogInfo(thread, timeout)
                      { AlarmIfTimeout = alarmIfTimeout, AlarmMethod = alarmMethod };

              m_log.DebugFormat(
                  "[WATCHDOG]: Started tracking thread {0}, ID {1}", twi.Thread.Name, twi.Thread.ManagedThreadId);

              // Register before starting so the new thread can call UpdateThread() immediately.
              lock (m_threads)
                  m_threads.Add(twi.Thread.ManagedThreadId, twi);

              try
              {
                  thread.Start();
              }
              catch
              {
                  // The thread never ran; leaving it registered would only produce a bogus timeout later.
                  RemoveThread(thread.ManagedThreadId);
                  throw;
              }

              return thread;
          }

          /// <summary>
          /// Marks the current thread as alive
          /// </summary>
          public static void UpdateThread()
          {
              UpdateThread(Thread.CurrentThread.ManagedThreadId);
          }

          /// <summary>
          /// Stops watchdog tracking on the current thread
          /// </summary>
          /// <returns>
          /// True if the thread was removed from the list of tracked
          /// threads, otherwise false
          /// </returns>
          public static bool RemoveThread()
          {
              return RemoveThread(Thread.CurrentThread.ManagedThreadId);
          }

          /// <summary>
          /// Stops watchdog tracking of the thread with the given managed thread ID.
          /// </summary>
          /// <param name="threadID">Managed thread ID of the thread to stop tracking</param>
          /// <returns>True if the thread was being tracked and has been removed, otherwise false</returns>
          private static bool RemoveThread(int threadID)
          {
              lock (m_threads)
              {
                  ThreadWatchdogInfo twi;
                  if (m_threads.TryGetValue(threadID, out twi))
                  {
                      m_log.DebugFormat(
                          "[WATCHDOG]: Removing thread {0}, ID {1}", twi.Thread.Name, twi.Thread.ManagedThreadId);

                      m_threads.Remove(threadID);

                      return true;
                  }
                  else
                  {
                      m_log.WarnFormat(
                          "[WATCHDOG]: Requested to remove thread with ID {0} but this is not being monitored", threadID);

                      return false;
                  }
              }
          }

          /// <summary>
          /// Aborts a tracked thread and stops tracking it.
          /// </summary>
          /// <param name="threadID">Managed thread ID of the thread to abort</param>
          /// <returns>True if the thread was being tracked and an abort was issued, otherwise false</returns>
          public static bool AbortThread(int threadID)
          {
              lock (m_threads)
              {
                  ThreadWatchdogInfo twi;
                  if (!m_threads.TryGetValue(threadID, out twi))
                      return false;

                  twi.Thread.Abort();
                  RemoveThread(threadID);

                  return true;
              }
          }

          /// <summary>
          /// Records a heartbeat for the thread with the given managed thread ID and clears its timed-out flag.
          /// </summary>
          /// <param name="threadID">Managed thread ID of the thread reporting in</param>
          private static void UpdateThread(int threadID)
          {
              // Although TryGetValue is not a thread safe operation, we use a try/catch here instead
              // of a lock for speed. Adding/removing threads is a very rare operation compared to
              // UpdateThread(), and a single UpdateThread() failure here and there won't break
              // anything
              try
              {
                  ThreadWatchdogInfo threadInfo;
                  if (m_threads.TryGetValue(threadID, out threadInfo))
                  {
                      threadInfo.LastTick = Environment.TickCount & Int32.MaxValue;
                      threadInfo.IsTimedOut = false;
                  }
                  else
                  {
                      m_log.WarnFormat("[WATCHDOG]: Asked to update thread {0} which is not being monitored", threadID);
                  }
              }
              catch (Exception e)
              {
                  // Still best effort, but leave a trace instead of swallowing the failure silently.
                  m_log.DebugFormat(
                      "[WATCHDOG]: Ignoring failed heartbeat update for thread {0}: {1}", threadID, e.Message);
              }
          }

          /// <summary>
          /// Get currently watched threads for diagnostic purposes
          /// </summary>
          /// <returns>An array holding the live monitoring info object of every tracked thread</returns>
          public static ThreadWatchdogInfo[] GetThreadsInfo()
          {
              lock (m_threads)
                  return m_threads.Values.ToArray();
          }

          /// <summary>
          /// Return the current thread's watchdog info.
          /// </summary>
          /// <returns>The watchdog info. null if the thread isn't being monitored.</returns>
          public static ThreadWatchdogInfo GetCurrentThreadInfo()
          {
              lock (m_threads)
              {
                  // TryGetValue leaves the out value null when the key is missing.
                  ThreadWatchdogInfo twi;
                  m_threads.TryGetValue(Thread.CurrentThread.ManagedThreadId, out twi);
                  return twi;
              }
          }

          /// <summary>
          /// Check watched threads. Fire alarm if appropriate.
          /// </summary>
          /// <remarks>
          /// Also runs the memory watchdog (when enabled), the registered checks and stats recording,
          /// then re-arms the one-shot timer if the watchdog is still enabled.
          /// </remarks>
          /// <param name="sender">Unused</param>
          /// <param name="e">Unused</param>
          private static void WatchdogTimerElapsed(object sender, System.Timers.ElapsedEventArgs e)
          {
              try
              {
                  int now = Environment.TickCount & Int32.MaxValue;
                  int msElapsed = now - LastWatchdogThreadTick;

                  // NOTE: when the masked tick count wraps, msElapsed goes negative and this warning
                  // (like the per-thread timeout test) is simply skipped for that pass.
                  if (msElapsed > WATCHDOG_INTERVAL_MS * 2)
                      m_log.WarnFormat(
                          "[WATCHDOG]: {0} ms since Watchdog last ran. Interval should be approximately {1} ms",
                          msElapsed, WATCHDOG_INTERVAL_MS);

                  LastWatchdogThreadTick = Environment.TickCount & Int32.MaxValue;

                  Action<ThreadWatchdogInfo> callback = OnWatchdogTimeout;

                  if (callback != null)
                  {
                      List<ThreadWatchdogInfo> callbackInfos = CollectCallbackInfos(now);

                      // Callbacks run outside the lock so subscribers cannot block thread registration.
                      if (callbackInfos != null)
                          foreach (ThreadWatchdogInfo callbackInfo in callbackInfos)
                              callback(callbackInfo);
                  }

                  if (MemoryWatchdog.Enabled)
                      MemoryWatchdog.Update();

                  ChecksManager.CheckChecks();
                  StatsManager.RecordStats();
              }
              catch (Exception ex)
              {
                  // System.Timers.Timer suppresses exceptions from Elapsed handlers, so log here
                  // or the failure would go completely unnoticed.
                  m_log.Error("[WATCHDOG]: Exception in watchdog timer handler", ex);
              }
              finally
              {
                  // The timer is one-shot: it must be re-armed even after a failed pass, but must
                  // not be revived if the watchdog was disabled while this pass was running.
                  if (m_enabled)
                      m_watchdogTimer.Start();
              }
          }

          /// <summary>
          /// Scan tracked threads, untrack stopped ones and flag newly timed out ones.
          /// </summary>
          /// <param name="now">Masked tick count sampled at the start of the watchdog pass</param>
          /// <returns>
          /// The infos to hand to OnWatchdogTimeout subscribers, or null if there is nothing to report.
          /// </returns>
          private static List<ThreadWatchdogInfo> CollectCallbackInfos(int now)
          {
              List<ThreadWatchdogInfo> callbackInfos = null;
              List<int> stoppedThreadIds = null;

              lock (m_threads)
              {
                  foreach (ThreadWatchdogInfo threadInfo in m_threads.Values)
                  {
                      // ThreadState is a flags enum, so test the bit rather than comparing for equality.
                      if ((threadInfo.Thread.ThreadState & ThreadState.Stopped) != 0)
                      {
                          // Removal is deferred: a Dictionary must not be modified while it is enumerated.
                          if (stoppedThreadIds == null)
                              stoppedThreadIds = new List<int>();

                          stoppedThreadIds.Add(threadInfo.Thread.ManagedThreadId);

                          if (callbackInfos == null)
                              callbackInfos = new List<ThreadWatchdogInfo>();

                          callbackInfos.Add(threadInfo);
                      }
                      else if (!threadInfo.IsTimedOut && now - threadInfo.LastTick >= threadInfo.Timeout)
                      {
                          threadInfo.IsTimedOut = true;

                          if (threadInfo.AlarmIfTimeout)
                          {
                              if (callbackInfos == null)
                                  callbackInfos = new List<ThreadWatchdogInfo>();

                              // Send a copy of the watchdog info to prevent race conditions where the watchdog
                              // thread updates the monitoring info after an alarm has been sent out.
                              callbackInfos.Add(new ThreadWatchdogInfo(threadInfo));
                          }
                      }
                  }

                  if (stoppedThreadIds != null)
                      foreach (int stoppedThreadId in stoppedThreadIds)
                          RemoveThread(stoppedThreadId);
              }

              return callbackInfos;
          }
      }
  }