Watchdog.cs 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420
  1. /*
  2. * Copyright (c) Contributors, http://opensimulator.org/
  3. * See CONTRIBUTORS.TXT for a full list of copyright holders.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. * * Redistributions of source code must retain the above copyright
  8. * notice, this list of conditions and the following disclaimer.
  9. * * Redistributions in binary form must reproduce the above copyright
  10. * notice, this list of conditions and the following disclaimer in the
  11. * documentation and/or other materials provided with the distribution.
  12. * * Neither the name of the OpenSimulator Project nor the
  13. * names of its contributors may be used to endorse or promote products
  14. * derived from this software without specific prior written permission.
  15. *
  16. * THIS SOFTWARE IS PROVIDED BY THE DEVELOPERS ``AS IS'' AND ANY
  17. * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  18. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  19. * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
  20. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  21. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  22. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  23. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  25. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. */
  27. using System;
  28. using System.Collections.Generic;
  29. using System.Linq;
  30. using System.Threading;
  31. using log4net;
  32. namespace OpenSim.Framework.Monitoring
  33. {
  34. /// <summary>
  35. /// Manages launching threads and keeping watch over them for timeouts
  36. /// </summary>
  37. public static class Watchdog
  38. {
  // Logger shared by all members of this static class.
  private static readonly ILog m_log = LogManager.GetLogger(typeof(Watchdog));

  /// <summary>
  /// Period, in milliseconds, of the timer that drives the watchdog checks.
  /// </summary>
  public const double WATCHDOG_INTERVAL_MS = 2500.0d;

  /// <summary>
  /// Default number of milliseconds without an update before a thread is considered dead.
  /// </summary>
  public const int DEFAULT_WATCHDOG_TIMEOUT_MS = 5000;
  /// <summary>
  /// Bookkeeping record for a single thread monitored by the watchdog.
  /// </summary>
  [System.Diagnostics.DebuggerDisplay("{Thread.Name}")]
  public class ThreadWatchdogInfo
  {
      /// <summary>
      /// The monitored thread.
      /// </summary>
      public Thread Thread { get; private set; }

      /// <summary>
      /// Tick count captured when this info was first constructed.
      /// </summary>
      /// <remarks>
      /// Not terribly good since this quickly wraps around.
      /// </remarks>
      public int FirstTick { get; private set; }

      /// <summary>
      /// Tick count of the most recent heartbeat update.
      /// </summary>
      public int LastTick { get; set; }

      /// <summary>
      /// Number of milliseconds before we notify that the thread is having a problem.
      /// </summary>
      public int Timeout { get; set; }

      /// <summary>
      /// Is this thread considered timed out?
      /// </summary>
      public bool IsTimedOut { get; set; }

      /// <summary>
      /// Will this thread trigger the alarm function if it has timed out?
      /// </summary>
      public bool AlarmIfTimeout { get; set; }

      /// <summary>
      /// Method to execute if the alarm goes off. If null then no alarm method is fired.
      /// </summary>
      public Func<string> AlarmMethod { get; set; }

      /// <summary>
      /// Stat structure associated with this thread. Null on instances made by the copy constructor.
      /// </summary>
      public Stat Stat { get; set; }

      /// <summary>
      /// Creates the info for a thread and registers a pull stat reporting the
      /// milliseconds elapsed since the thread's last update.
      /// </summary>
      /// <param name="thread">Thread to monitor.</param>
      /// <param name="timeout">Milliseconds without an update before the thread counts as timed out.</param>
      /// <param name="name">Name used for the stat and its description.</param>
      public ThreadWatchdogInfo(Thread thread, int timeout, string name)
      {
          Thread = thread;
          Timeout = timeout;
          FirstTick = Environment.TickCount & Int32.MaxValue;
          LastTick = FirstTick;

          Stat
              = new Stat(
                  name,
                  string.Format("Last update of thread {0}", name),
                  "",
                  "ms",
                  "server",
                  "thread",
                  StatType.Pull,
                  MeasuresOfInterest.None,
                  // The parentheses matter: '-' binds tighter than '&', so without them the
                  // expression would be TickCount & (Int32.MaxValue - LastTick).
                  stat => stat.Value = (Environment.TickCount & Int32.MaxValue) - LastTick,
                  StatVerbosity.Debug);

          StatsManager.RegisterStat(Stat);
      }

      /// <summary>
      /// Creates a snapshot copy of another info. The Stat is not copied and no stat is registered.
      /// </summary>
      /// <param name="previousTwi">Info to copy from.</param>
      public ThreadWatchdogInfo(ThreadWatchdogInfo previousTwi)
      {
          Thread = previousTwi.Thread;
          FirstTick = previousTwi.FirstTick;
          LastTick = previousTwi.LastTick;
          Timeout = previousTwi.Timeout;
          IsTimedOut = previousTwi.IsTimedOut;
          AlarmIfTimeout = previousTwi.AlarmIfTimeout;
          AlarmMethod = previousTwi.AlarmMethod;
      }

      /// <summary>
      /// Deregisters the stat associated with this thread, if there is one.
      /// </summary>
      public void Cleanup()
      {
          // Snapshot copies never receive a Stat, so there is nothing to deregister for them.
          if (Stat != null)
              StatsManager.DeregisterStat(Stat);
      }
  }
  /// <summary>
  /// Raised from the watchdog timer for each tracked thread that has
  /// AlarmIfTimeout set and has not been updated within its Timeout.
  /// Handlers receive a copy of the thread's watchdog info.
  /// </summary>
  public static event Action<ThreadWatchdogInfo> OnWatchdogTimeout;
  /// <summary>
  /// Is this watchdog active? Setting the value starts or stops the watchdog timer.
  /// </summary>
  /// <remarks>
  /// Once Stop() has disposed the timer the watchdog cannot be re-enabled;
  /// the request is logged and ignored instead of throwing.
  /// </remarks>
  public static bool Enabled
  {
      get { return m_enabled; }
      set
      {
          if (value == m_enabled)
              return;

          // Stop() disposes the timer and nulls the field, so read it once and check it.
          System.Timers.Timer timer = m_watchdogTimer;
          if (timer == null)
          {
              m_log.WarnFormat(
                  "[WATCHDOG]: Ignoring request to set Enabled to {0} because the watchdog has been stopped", value);
              return;
          }

          m_enabled = value;

          if (value)
          {
              // Set now so we don't get alerted on the first run
              LastWatchdogThreadTick = Environment.TickCount & Int32.MaxValue;
          }

          timer.Enabled = value;
      }
  }
  // Backing field for Enabled.
  private static bool m_enabled;

  // Tracked threads keyed by managed thread ID. This object is also the lock guarding
  // the collection, so it is readonly to guarantee the lock target can never be swapped.
  private static readonly Dictionary<int, ThreadWatchdogInfo> m_threads;

  // Timer that drives the periodic check. Stop() disposes it and sets this field to null.
  private static System.Timers.Timer m_watchdogTimer;

  /// <summary>
  /// Tick count recorded when the watchdog timer handler last ran, or when the
  /// watchdog was last enabled.
  /// </summary>
  /// <remarks>
  /// Should run every WATCHDOG_INTERVAL_MS
  /// </remarks>
  public static int LastWatchdogThreadTick { get; private set; }
  /// <summary>
  /// Creates the thread table and a one-shot timer wired to WatchdogTimerElapsed.
  /// The timer is not started here.
  /// </summary>
  static Watchdog()
  {
      m_threads = new Dictionary<int, ThreadWatchdogInfo>();

      System.Timers.Timer timer = new System.Timers.Timer(WATCHDOG_INTERVAL_MS);
      // One-shot: the elapsed handler restarts the timer itself.
      timer.AutoReset = false;
      timer.Elapsed += WatchdogTimerElapsed;
      m_watchdogTimer = timer;
  }
  /// <summary>
  /// Shuts the watchdog down: disables it, disposes the timer, aborts every
  /// tracked thread that is still alive and clears the tracking table.
  /// </summary>
  /// <remarks>
  /// A failure to abort one thread is logged and does not prevent the remaining
  /// threads from being aborted or the table from being cleared.
  /// </remarks>
  public static void Stop()
  {
      if (m_threads == null)
          return;

      lock (m_threads)
      {
          m_enabled = false;

          if (m_watchdogTimer != null)
          {
              m_watchdogTimer.Dispose();
              m_watchdogTimer = null;
          }

          foreach (ThreadWatchdogInfo twi in m_threads.Values)
          {
              Thread t = twi.Thread;

              try
              {
                  if (t.IsAlive)
                      t.Abort();
              }
              catch (Exception ex)
              {
                  // Keep going: the other threads still need aborting and the table clearing.
                  m_log.WarnFormat(
                      "[WATCHDOG]: Stop: failed to abort thread {0}, ID {1}: {2}",
                      t.Name, t.ManagedThreadId, ex.Message);
              }
          }

          m_threads.Clear();
      }
  }
  /// <summary>
  /// Add a thread to the watchdog tracker.
  /// </summary>
  /// <remarks>
  /// If an entry with the same managed thread ID is already tracked, it is
  /// cleaned up and replaced rather than causing an exception.
  /// </remarks>
  /// <param name="info">Information about the thread.</param>
  /// <param name="name">Name of the thread; used only in the log message.</param>
  /// <param name="log">If true then creation of thread is logged.</param>
  public static void AddThread(ThreadWatchdogInfo info, string name, bool log = true)
  {
      int threadId = info.Thread.ManagedThreadId;

      if (log)
          m_log.DebugFormat(
              "[WATCHDOG]: Started tracking thread {0}, ID {1}", name, threadId);

      lock (m_threads)
      {
          // Entries are only removed by RemoveThread() or by the timer handler, so an
          // entry for this ID may still be present; Dictionary.Add would throw in that case.
          ThreadWatchdogInfo previous;
          if (m_threads.TryGetValue(threadId, out previous) && !ReferenceEquals(previous, info))
          {
              m_log.WarnFormat(
                  "[WATCHDOG]: Replacing existing watchdog entry for thread ID {0}", threadId);
              previous.Cleanup();
          }

          m_threads[threadId] = info;
      }
  }
  /// <summary>
  /// Records a heartbeat for the calling thread.
  /// </summary>
  public static void UpdateThread()
  {
      int callerId = Thread.CurrentThread.ManagedThreadId;
      UpdateThread(callerId);
  }
  /// <summary>
  /// Stops watchdog tracking of the calling thread.
  /// </summary>
  /// <param name="log">If true then a successful removal is logged.</param>
  /// <returns>
  /// True if the thread was removed from the list of tracked
  /// threads, otherwise false
  /// </returns>
  public static bool RemoveThread(bool log = true)
  {
      int callerId = Thread.CurrentThread.ManagedThreadId;
      return RemoveThread(callerId, log);
  }
  /// <summary>
  /// Removes the entry for the given managed thread ID and cleans up its stat.
  /// </summary>
  /// <param name="threadID">Managed thread ID of the thread to stop tracking.</param>
  /// <param name="log">If true then a successful removal is logged.</param>
  /// <returns>True if an entry was found and removed, otherwise false.</returns>
  private static bool RemoveThread(int threadID, bool log = true)
  {
      lock (m_threads)
      {
          ThreadWatchdogInfo tracked;

          if (!m_threads.TryGetValue(threadID, out tracked))
          {
              m_log.WarnFormat(
                  "[WATCHDOG]: Requested to remove thread with ID {0} but this is not being monitored", threadID);

              return false;
          }

          if (log)
              m_log.DebugFormat(
                  "[WATCHDOG]: Removing thread {0}, ID {1}", tracked.Thread.Name, tracked.Thread.ManagedThreadId);

          tracked.Cleanup();
          m_threads.Remove(threadID);

          return true;
      }
  }
  /// <summary>
  /// Aborts the tracked thread with the given managed thread ID and stops tracking it.
  /// </summary>
  /// <param name="threadID">Managed thread ID of the thread to abort.</param>
  /// <returns>True if the thread was being tracked, otherwise false.</returns>
  public static bool AbortThread(int threadID)
  {
      lock (m_threads)
      {
          ThreadWatchdogInfo target;

          if (!m_threads.TryGetValue(threadID, out target))
              return false;

          target.Thread.Abort();
          RemoveThread(threadID);

          return true;
      }
  }
  /// <summary>
  /// Records a heartbeat for the thread with the given managed thread ID:
  /// refreshes its last tick and clears its timed-out flag.
  /// </summary>
  /// <param name="threadID">Managed thread ID of the thread to mark as alive.</param>
  private static void UpdateThread(int threadID)
  {
      // The lookup is deliberately done without taking the lock. Threads are added and
      // removed far less often than they are updated, so speed wins here; an occasional
      // failed update is harmless, which is why any exception is swallowed below.
      try
      {
          ThreadWatchdogInfo tracked;

          if (!m_threads.TryGetValue(threadID, out tracked))
          {
              m_log.WarnFormat("[WATCHDOG]: Asked to update thread {0} which is not being monitored", threadID);
              return;
          }

          tracked.LastTick = Environment.TickCount & Int32.MaxValue;
          tracked.IsTimedOut = false;
      }
      catch
      {
          // Best effort only; see the note at the top of this method.
      }
  }
  /// <summary>
  /// Takes a snapshot of the currently watched threads for diagnostic purposes.
  /// </summary>
  /// <returns>An array holding the info of every tracked thread at the time of the call.</returns>
  public static ThreadWatchdogInfo[] GetThreadsInfo()
  {
      ThreadWatchdogInfo[] snapshot;

      lock (m_threads)
      {
          snapshot = m_threads.Values.ToArray();
      }

      return snapshot;
  }
  /// <summary>
  /// Looks up the watchdog info of the calling thread.
  /// </summary>
  /// <returns>The watchdog info. null if the thread isn't being monitored.</returns>
  public static ThreadWatchdogInfo GetCurrentThreadInfo()
  {
      ThreadWatchdogInfo current;

      lock (m_threads)
      {
          if (!m_threads.TryGetValue(Thread.CurrentThread.ManagedThreadId, out current))
              current = null;
      }

      return current;
  }
  /// <summary>
  /// Check watched threads. Fire alarm if appropriate.
  /// </summary>
  /// <remarks>
  /// Runs on each tick of the one-shot watchdog timer. Stopped threads are pruned and
  /// timed-out threads are reported to OnWatchdogTimeout subscribers (both only when
  /// there is at least one subscriber), then the memory watchdog, checks and stats are
  /// updated. The timer is re-armed in a finally block so that an exception in any of
  /// these steps cannot leave the watchdog permanently stopped.
  /// </remarks>
  /// <param name="sender">Timer that raised the event; not used.</param>
  /// <param name="e">Event data; not used.</param>
  private static void WatchdogTimerElapsed(object sender, System.Timers.ElapsedEventArgs e)
  {
      if (!m_enabled)
          return;

      try
      {
          int now = Environment.TickCount & Int32.MaxValue;
          int msElapsed = now - LastWatchdogThreadTick;

          if (msElapsed > WATCHDOG_INTERVAL_MS * 2)
              m_log.WarnFormat(
                  "[WATCHDOG]: {0} ms since Watchdog last ran. Interval should be approximately {1} ms",
                  msElapsed, WATCHDOG_INTERVAL_MS);

          LastWatchdogThreadTick = Environment.TickCount & Int32.MaxValue;

          // Copy the delegate so that unsubscription during this run cannot null it.
          Action<ThreadWatchdogInfo> callback = OnWatchdogTimeout;

          if (callback != null)
          {
              List<ThreadWatchdogInfo> callbackInfos = null;
              List<ThreadWatchdogInfo> threadsToRemove = null;

              const ThreadState thgone = ThreadState.Stopped;

              lock (m_threads)
              {
                  foreach (ThreadWatchdogInfo threadInfo in m_threads.Values)
                  {
                      // The watchdog was disabled while we were scanning; give up.
                      if (!m_enabled)
                          return;

                      if ((threadInfo.Thread.ThreadState & thgone) != 0)
                      {
                          // Stopped threads are only pruned; no timeout callback is raised for them.
                          if (threadsToRemove == null)
                              threadsToRemove = new List<ThreadWatchdogInfo>();

                          threadsToRemove.Add(threadInfo);
                      }
                      else if (!threadInfo.IsTimedOut && now - threadInfo.LastTick >= threadInfo.Timeout)
                      {
                          threadInfo.IsTimedOut = true;

                          if (threadInfo.AlarmIfTimeout)
                          {
                              if (callbackInfos == null)
                                  callbackInfos = new List<ThreadWatchdogInfo>();

                              // Send a copy of the watchdog info to prevent race conditions where the watchdog
                              // thread updates the monitoring info after an alarm has been sent out.
                              callbackInfos.Add(new ThreadWatchdogInfo(threadInfo));
                          }
                      }
                  }

                  // Removal is deferred until the enumeration above has finished.
                  if (threadsToRemove != null)
                      foreach (ThreadWatchdogInfo twi in threadsToRemove)
                          RemoveThread(twi.Thread.ManagedThreadId);
              }

              // Callbacks are invoked outside the lock.
              if (callbackInfos != null)
                  foreach (ThreadWatchdogInfo callbackInfo in callbackInfos)
                      callback(callbackInfo);
          }

          if (MemoryWatchdog.Enabled)
              MemoryWatchdog.Update();

          ChecksManager.CheckChecks();
          StatsManager.RecordStats();
      }
      finally
      {
          // The timer does not auto-reset, so it must be re-armed here. Stop() may have
          // disposed and nulled it in the meantime, hence the local copy and the checks.
          System.Timers.Timer timer = m_watchdogTimer;

          if (m_enabled && timer != null)
          {
              try
              {
                  timer.Start();
              }
              catch (ObjectDisposedException)
              {
                  // Stop() disposed the timer between the check and the call; nothing to restart.
              }
          }
      }
  }
  360. }
  361. }