123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506 |
- /*
- * Copyright (c) Contributors, http://opensimulator.org/
- * See CONTRIBUTORS.TXT for a full list of copyright holders.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of the OpenSimulator Project nor the
- * names of its contributors may be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE DEVELOPERS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
- using System;
- using System.Collections.Generic;
- using System.Linq;
- using System.Threading;
- using log4net;
- namespace OpenSim.Framework.Monitoring
- {
- /// <summary>
- /// Manages launching threads and keeping watch over them for timeouts
- /// </summary>
- public static class Watchdog
- {
- /// <summary>Timer interval in milliseconds for the watchdog timer</summary>
- public const double WATCHDOG_INTERVAL_MS = 2500.0d;
- /// <summary>Default timeout in milliseconds before a thread is considered dead</summary>
- public const int DEFAULT_WATCHDOG_TIMEOUT_MS = 5000;
/// <summary>
/// Bookkeeping record for one monitored thread: the thread itself, its heartbeat ticks,
/// its timeout/alarm settings and the stat that reports the time since its last heartbeat.
/// </summary>
[System.Diagnostics.DebuggerDisplay("{Thread.Name}")]
public class ThreadWatchdogInfo
{
    /// <summary>
    /// The thread being monitored.
    /// </summary>
    public Thread Thread { get; private set; }

    /// <summary>
    /// Tick count captured when this record was created (approximately when the thread was started).
    /// </summary>
    /// <remarks>
    /// Not terribly good since the masked tick count wraps around quickly.
    /// </remarks>
    public int FirstTick { get; private set; }

    /// <summary>
    /// Tick count of the most recent heartbeat update for this thread.
    /// </summary>
    public int LastTick { get; set; }

    /// <summary>
    /// Number of milliseconds without a heartbeat before the thread is reported as having a problem.
    /// </summary>
    public int Timeout { get; set; }

    /// <summary>
    /// Is this thread currently considered timed out?
    /// </summary>
    public bool IsTimedOut { get; set; }

    /// <summary>
    /// Will this thread trigger the alarm function if it has timed out?
    /// </summary>
    public bool AlarmIfTimeout { get; set; }

    /// <summary>
    /// Method to execute if the alarm goes off.  If null then no alarm method is fired.
    /// </summary>
    public Func<string> AlarmMethod { get; set; }

    /// <summary>
    /// Stat structure associated with this thread.  Null on instances made by the copy constructor.
    /// </summary>
    public Stat Stat { get; set; }

    /// <summary>
    /// Create the record for a thread and register a stat reporting the time since its last heartbeat.
    /// </summary>
    /// <param name="thread">Thread to monitor.  Its Name is used as the stat name.</param>
    /// <param name="timeout">Milliseconds without a heartbeat before the thread counts as timed out.</param>
    public ThreadWatchdogInfo(Thread thread, int timeout)
    {
        Thread = thread;
        Timeout = timeout;
        FirstTick = Environment.TickCount & Int32.MaxValue;
        LastTick = FirstTick;

        Stat
            = new Stat(
                thread.Name,
                string.Format("Last update of thread {0}", thread.Name),
                "",
                "ms",
                "server",
                "thread",
                StatType.Pull,
                MeasuresOfInterest.None,
                // The parentheses matter: '-' binds tighter than '&', so without them the expression
                // becomes TickCount & (Int32.MaxValue - LastTick) instead of "masked now minus LastTick".
                stat => stat.Value = (Environment.TickCount & Int32.MaxValue) - LastTick,
                StatVerbosity.Debug);

        StatsManager.RegisterStat(Stat);
    }

    /// <summary>
    /// Create a snapshot copy of an existing record.
    /// </summary>
    /// <remarks>
    /// The Stat is deliberately not copied and no stat is registered, so the copy leaves Stat null.
    /// </remarks>
    /// <param name="previousTwi">Record to copy the thread, ticks, timeout and alarm settings from.</param>
    public ThreadWatchdogInfo(ThreadWatchdogInfo previousTwi)
    {
        Thread = previousTwi.Thread;
        FirstTick = previousTwi.FirstTick;
        LastTick = previousTwi.LastTick;
        Timeout = previousTwi.Timeout;
        IsTimedOut = previousTwi.IsTimedOut;
        AlarmIfTimeout = previousTwi.AlarmIfTimeout;
        AlarmMethod = previousTwi.AlarmMethod;
    }

    /// <summary>
    /// Deregister the stat that the constructor registered for this thread.
    /// </summary>
    public void Cleanup()
    {
        // Snapshot copies never registered a stat, so there is nothing to deregister for them.
        if (Stat != null)
            StatsManager.DeregisterStat(Stat);
    }
}
- /// <summary>
- /// This event is called whenever a tracked thread is
- /// stopped or has not called UpdateThread() in time
- /// </summary>
- public static event Action<ThreadWatchdogInfo> OnWatchdogTimeout;
- public static JobEngine JobEngine { get; private set; }
/// <summary>
/// Gets or sets whether the watchdog timer is active.
/// </summary>
/// <remarks>
/// Assigning the value the watchdog already has does nothing.
/// </remarks>
public static bool Enabled
{
    get { return m_enabled; }

    set
    {
        if (m_enabled != value)
        {
            m_enabled = value;

            // When switching on, reset the reference tick first so the first timer pass
            // does not warn about a long gap since the watchdog last ran.
            if (value)
                LastWatchdogThreadTick = Environment.TickCount & Int32.MaxValue;

            m_watchdogTimer.Enabled = value;
        }
    }
}
- private static bool m_enabled;
- private static readonly ILog m_log = LogManager.GetLogger(System.Reflection.MethodBase.GetCurrentMethod().DeclaringType);
- private static Dictionary<int, ThreadWatchdogInfo> m_threads;
- private static System.Timers.Timer m_watchdogTimer;
- /// <summary>
- /// Last time the watchdog thread ran.
- /// </summary>
- /// <remarks>
- /// Should run every WATCHDOG_INTERVAL_MS
- /// </remarks>
- public static int LastWatchdogThreadTick { get; private set; }
/// <summary>
/// Sets up the job engine, the table of tracked threads and the watchdog timer.
/// </summary>
/// <remarks>
/// The timer is created with AutoReset off, so it fires once per Start() and is re-armed
/// by the elapsed handler.  It is not started here.
/// </remarks>
static Watchdog()
{
    JobEngine = new JobEngine();
    m_threads = new Dictionary<int, ThreadWatchdogInfo>();

    System.Timers.Timer timer = new System.Timers.Timer(WATCHDOG_INTERVAL_MS) { AutoReset = false };
    timer.Elapsed += WatchdogTimerElapsed;

    m_watchdogTimer = timer;
}
/// <summary>
/// Start a new watchdog-tracked thread with no alarm method and the default timeout.
/// </summary>
/// <param name="start">The method that will be executed in the new thread</param>
/// <param name="name">Name given to the new thread</param>
/// <param name="priority">Priority the thread runs at</param>
/// <param name="isBackground">True to run the thread as a background thread, otherwise false</param>
/// <param name="alarmIfTimeout">Whether a timeout of this thread should trigger the alarm</param>
/// <param name="log">If true then the creation of the thread is logged.</param>
/// <returns>The newly created Thread object</returns>
public static Thread StartThread(
    ThreadStart start, string name, ThreadPriority priority, bool isBackground, bool alarmIfTimeout, bool log = true)
{
    // This overload never supplies an alarm method.
    Func<string> noAlarmMethod = null;

    return StartThread(
        start, name, priority, isBackground, alarmIfTimeout,
        noAlarmMethod, DEFAULT_WATCHDOG_TIMEOUT_MS, log);
}
/// <summary>
/// Create a thread, register it with the watchdog and then start it.
/// </summary>
/// <param name="start">The method that will be executed in the new thread</param>
/// <param name="name">Name given to the new thread</param>
/// <param name="priority">Priority the thread runs at</param>
/// <param name="isBackground">True to run the thread as a background thread, otherwise false</param>
/// <param name="alarmIfTimeout">Whether a timeout of this thread should trigger the alarm</param>
/// <param name="alarmMethod">
/// Alarm method to call if alarmIfTimeout is true and there is a timeout.
/// Normally, this will just return some useful debugging information.
/// </param>
/// <param name="timeout">Number of milliseconds to wait before a timeout warning is issued.</param>
/// <param name="log">If true then the creation of the thread is logged.</param>
/// <returns>The newly created Thread object</returns>
public static Thread StartThread(
    ThreadStart start, string name, ThreadPriority priority, bool isBackground,
    bool alarmIfTimeout, Func<string> alarmMethod, int timeout, bool log = true)
{
    Thread newThread = new Thread(start)
    {
        Name = name,
        Priority = priority,
        IsBackground = isBackground
    };

    ThreadWatchdogInfo info = new ThreadWatchdogInfo(newThread, timeout);
    info.AlarmIfTimeout = alarmIfTimeout;
    info.AlarmMethod = alarmMethod;

    int managedId = newThread.ManagedThreadId;

    if (log)
    {
        m_log.DebugFormat(
            "[WATCHDOG]: Started tracking thread {0}, ID {1}", newThread.Name, managedId);
    }

    // Register before starting so the thread is already tracked when its body begins to run.
    lock (m_threads)
    {
        m_threads.Add(managedId, info);
    }

    newThread.Start();

    return newThread;
}
/// <summary>
/// Run the callback in a new watchdog-tracked thread immediately.  If the callback throws, the
/// exception is logged but not propagated.
/// </summary>
/// <remarks>
/// In regression-test mode the callback is instead run synchronously on the calling thread.
/// </remarks>
/// <param name="callback">Code for the thread to execute.</param>
/// <param name="name">Name of the thread</param>
/// <param name="obj">Object to pass to the thread.</param>
/// <param name="log">If true then creation of the thread is logged.</param>
public static void RunInThread(WaitCallback callback, string name, object obj, bool log = false)
{
    if (Util.FireAndForgetMethod == FireAndForgetMethod.RegressionTest)
    {
        Culture.SetCurrentCulture();
        callback(obj);
        return;
    }

    ThreadStart ts = new ThreadStart(delegate()
    {
        try
        {
            Culture.SetCurrentCulture();
            callback(obj);
        }
        catch (Exception e)
        {
            m_log.Error(string.Format("[WATCHDOG]: Exception in thread {0}.", name), e);
        }
        finally
        {
            // Stop tracking whether or not the callback threw; otherwise a failed callback would
            // leave its dead thread (and the thread's stat) registered with the watchdog.
            try
            {
                Watchdog.RemoveThread(log:false);
            }
            catch (Exception e)
            {
                m_log.Error(
                    string.Format("[WATCHDOG]: Exception while removing thread {0} from tracking.", name), e);
            }
        }
    });

    StartThread(ts, name, ThreadPriority.Normal, true, false, log:log);
}
/// <summary>
/// Record a heartbeat for the calling thread, marking it as alive.
/// </summary>
public static void UpdateThread()
{
    int currentThreadId = Thread.CurrentThread.ManagedThreadId;

    UpdateThread(currentThreadId);
}
/// <summary>
/// Stop watchdog tracking of the calling thread.
/// </summary>
/// <param name="log">If true then the normal removal of the thread is logged.</param>
/// <returns>
/// True if the thread was removed from the list of tracked threads,
/// false if it was not being tracked
/// </returns>
public static bool RemoveThread(bool log = true)
{
    int currentThreadId = Thread.CurrentThread.ManagedThreadId;

    return RemoveThread(currentThreadId, log);
}
/// <summary>
/// Stop watchdog tracking of the thread with the given managed thread ID.
/// </summary>
/// <param name="threadID">Managed thread ID of the thread to stop tracking.</param>
/// <param name="log">If true then a successful removal is logged.  A failed lookup is always logged.</param>
/// <returns>True if the thread was tracked and has been removed, otherwise false</returns>
private static bool RemoveThread(int threadID, bool log = true)
{
    lock (m_threads)
    {
        ThreadWatchdogInfo tracked;

        if (!m_threads.TryGetValue(threadID, out tracked))
        {
            m_log.WarnFormat(
                "[WATCHDOG]: Requested to remove thread with ID {0} but this is not being monitored", threadID);

            return false;
        }

        if (log)
        {
            m_log.DebugFormat(
                "[WATCHDOG]: Removing thread {0}, ID {1}", tracked.Thread.Name, tracked.Thread.ManagedThreadId);
        }

        // Let the record deregister its stat before the entry is dropped from the table.
        tracked.Cleanup();
        m_threads.Remove(threadID);

        return true;
    }
}
/// <summary>
/// Abort a tracked thread and stop tracking it.
/// </summary>
/// <param name="threadID">Managed thread ID of the thread to abort.</param>
/// <returns>True if the thread was being tracked (and Abort was called on it), otherwise false</returns>
public static bool AbortThread(int threadID)
{
    lock (m_threads)
    {
        ThreadWatchdogInfo target;

        if (!m_threads.TryGetValue(threadID, out target))
            return false;

        target.Thread.Abort();
        RemoveThread(threadID);

        return true;
    }
}
/// <summary>
/// Record a heartbeat for the thread with the given managed thread ID.
/// </summary>
/// <param name="threadID">Managed thread ID of the thread to mark as alive.</param>
private static void UpdateThread(int threadID)
{
    // The lookup is deliberately done without taking the lock, for speed; TryGetValue is not
    // thread safe, so any failure is swallowed instead.  Threads are added and removed far less
    // often than they are updated, and an occasional lost update does no harm.
    try
    {
        ThreadWatchdogInfo tracked;

        if (!m_threads.TryGetValue(threadID, out tracked))
        {
            m_log.WarnFormat("[WATCHDOG]: Asked to update thread {0} which is not being monitored", threadID);
            return;
        }

        tracked.LastTick = Environment.TickCount & Int32.MaxValue;
        tracked.IsTimedOut = false;
    }
    catch { }
}
-
/// <summary>
/// Take a snapshot of the currently watched threads, for diagnostic purposes.
/// </summary>
/// <returns>A new array holding the watchdog info of every tracked thread</returns>
public static ThreadWatchdogInfo[] GetThreadsInfo()
{
    ThreadWatchdogInfo[] snapshot;

    lock (m_threads)
    {
        snapshot = m_threads.Values.ToArray();
    }

    return snapshot;
}
/// <summary>
/// Look up the watchdog info of the calling thread.
/// </summary>
/// <returns>The watchdog info, or null if the calling thread isn't being monitored.</returns>
public static ThreadWatchdogInfo GetCurrentThreadInfo()
{
    int currentThreadId = Thread.CurrentThread.ManagedThreadId;
    ThreadWatchdogInfo info;

    lock (m_threads)
    {
        if (!m_threads.TryGetValue(currentThreadId, out info))
            info = null;
    }

    return info;
}
/// <summary>
/// Timer handler: check watched threads and fire the timeout event where appropriate, then run
/// the memory watchdog, checks and stats recording, and finally re-arm the timer.
/// </summary>
/// <remarks>
/// Threads are only inspected when OnWatchdogTimeout has at least one subscriber.
/// </remarks>
/// <param name="sender">Not used.</param>
/// <param name="e">Not used.</param>
private static void WatchdogTimerElapsed(object sender, System.Timers.ElapsedEventArgs e)
{
    try
    {
        int now = Environment.TickCount & Int32.MaxValue;
        int msElapsed = now - LastWatchdogThreadTick;

        if (msElapsed > WATCHDOG_INTERVAL_MS * 2)
            m_log.WarnFormat(
                "[WATCHDOG]: {0} ms since Watchdog last ran. Interval should be approximately {1} ms",
                msElapsed, WATCHDOG_INTERVAL_MS);

        LastWatchdogThreadTick = Environment.TickCount & Int32.MaxValue;

        Action<ThreadWatchdogInfo> callback = OnWatchdogTimeout;

        if (callback != null)
        {
            List<ThreadWatchdogInfo> callbackInfos = null;
            List<int> stoppedThreadIds = null;

            lock (m_threads)
            {
                foreach (ThreadWatchdogInfo threadInfo in m_threads.Values)
                {
                    if (threadInfo.Thread.ThreadState == ThreadState.Stopped)
                    {
                        // Only note the ID here.  Removing the entry now would modify m_threads
                        // while it is being enumerated and invalidate the enumerator.
                        if (stoppedThreadIds == null)
                            stoppedThreadIds = new List<int>();

                        stoppedThreadIds.Add(threadInfo.Thread.ManagedThreadId);

                        if (callbackInfos == null)
                            callbackInfos = new List<ThreadWatchdogInfo>();

                        callbackInfos.Add(threadInfo);
                    }
                    else if (!threadInfo.IsTimedOut && now - threadInfo.LastTick >= threadInfo.Timeout)
                    {
                        threadInfo.IsTimedOut = true;

                        if (threadInfo.AlarmIfTimeout)
                        {
                            if (callbackInfos == null)
                                callbackInfos = new List<ThreadWatchdogInfo>();

                            // Send a copy of the watchdog info to prevent race conditions where the
                            // monitoring info is updated after an alarm has been sent out.
                            callbackInfos.Add(new ThreadWatchdogInfo(threadInfo));
                        }
                    }
                }

                // Enumeration is finished, so the stopped threads can now be removed safely.
                if (stoppedThreadIds != null)
                    foreach (int stoppedThreadId in stoppedThreadIds)
                        RemoveThread(stoppedThreadId);
            }

            // Subscribers are invoked outside the lock.
            if (callbackInfos != null)
                foreach (ThreadWatchdogInfo callbackInfo in callbackInfos)
                    callback(callbackInfo);
        }

        if (MemoryWatchdog.Enabled)
            MemoryWatchdog.Update();

        ChecksManager.CheckChecks();
        StatsManager.RecordStats();
    }
    finally
    {
        // The timer does not auto-reset, so it must be re-armed even if something above threw,
        // otherwise the watchdog would stop for good.  It is not re-armed if the watchdog was
        // disabled while this pass was running.
        if (m_enabled)
            m_watchdogTimer.Start();
    }
}
/// <summary>
/// Run a job.
/// </summary>
/// <remarks>
/// Unlike direct scheduling (e.g. Util.FireAndForget), a job is handed to the job engine when that
/// engine is running.  The engine currently performs all jobs one after another on a single thread.
/// This prevents the overload and server freezes that were observed when hundreds of connections all
/// tried to do work at once (e.g. in conference situations).  With fewer connections the small delay
/// from running jobs in sequence rather than concurrently has not been noticeable in testing, though a
/// more sophisticated future implementation could run jobs concurrently while the server load is low.
///
/// Be aware, however, that some callers rely on all jobs being performed in sequence whenever any
/// jobs are performed in sequence (i.e. if the job engine is active or not).  Expanding the job engine
/// beyond a single thread will therefore require considerable thought.
///
/// Submitted jobs must also be guaranteed to finish within a reasonable time (e.g. they cannot
/// include a network delay with a long timeout).  For now, work that could suffer from this should
/// still be run directly with RunInThread(), Util.FireAndForget(), etc.  This is another area where
/// the job engine, and with it CPU utilization, could be improved through better management of
/// concurrency within OpenSimulator.
/// </remarks>
/// <param name="jobType">General classification for the job (e.g. "RezAttachments").</param>
/// <param name="callback">Callback for job.</param>
/// <param name="name">Specific name of job (e.g. "RezAttachments for Joe Bloggs")</param>
/// <param name="obj">Object to pass to callback when run</param>
/// <param name="canRunInThisThread">If set to true then the job may be run in the calling thread.</param>
/// <param name="mustNotTimeout">If true then the job must never timeout.</param>
/// <param name="log">If set to true then extra logging is performed.</param>
public static void RunJob(
    string jobType, WaitCallback callback, string name, object obj,
    bool canRunInThisThread = false, bool mustNotTimeout = false,
    bool log = false)
{
    // Regression tests run everything synchronously on the calling thread.
    if (Util.FireAndForgetMethod == FireAndForgetMethod.RegressionTest)
    {
        Culture.SetCurrentCulture();
        callback(obj);
        return;
    }

    // A running job engine takes precedence over every other way of running the job.
    if (JobEngine.IsRunning)
    {
        JobEngine.QueueRequest(name, callback, obj);
        return;
    }

    if (canRunInThisThread)
    {
        callback(obj);
        return;
    }

    if (mustNotTimeout)
    {
        RunInThread(callback, name, obj, log);
        return;
    }

    Util.FireAndForget(callback, obj, name);
}
- }
- }
|