
Changing to use BatchSetAsync; attempting to deal with excessive memory issues.

Gwyneth Llewelyn 6 years ago
parent
commit
1a196640ff
2 changed files with 74 additions and 53 deletions
  1. README.md (+2 -2)
  2. gosl.go (+72 -51)

+ 2 - 2
README.md

@@ -43,7 +43,7 @@ You can run the executable either as:
 	-dir string
 		Directory where database files are stored (default "slkvdb")
 	-import string
-		(experimental) Import database from W-Hat (use the csv.bz2 version)
+		Import database from W-Hat (use the csv.bz2 version)
 	-nomemory
 		Attempt to use only disk to save memory (important for shared webservers)
 	-port string
@@ -79,7 +79,7 @@ Note that the current version can be used as a direct replacement for [W-Hat's n
 
 To actually _use_ the W-Hat database, you need to download it first and import it. This means using the `-import` command (use the `name2key.csv.bz2` version). W-Hat still updates that database daily, so, with some clever `cron` magic, you might be able to get a fresh copy every day to import. Note that the database is supposed to be unique by name (and the UUIDs are not supposed to change): that means that you can import the 'new' version over an 'old' version, and only the relevant entries will be changed. Also, if you happen to have captured new entries (not yet existing on W-Hat's database), then these will _not_ be overwritten (or deleted) with a new import. To delete an old database, just delete the directory it is in.
 
-Importing the whole W-Hat database, which has a bit over 9 million entries, took on my Mac 3 minutes and 5 seconds. Aye, that's quite a long time. On a shared server, it can be even longer. The code has been substantially changed to use `BatchSet` which is allegedly the recommended way of importing large databases, but even in the scenario to consume as little memory as possible, it will break most shared servers, simply because Go's garbage collector will not be fast enough to clean up after each batch is sent — I may have to take a look at how to do this better, perhaps with less concurrency.
+Importing the whole W-Hat database, which has a bit over 9 million entries, took 1 minute and 38 seconds on my Mac. Aye, that's quite a long time. On a shared server, it can be even longer. The code has been substantially changed to use `BatchSet`, which is allegedly the recommended way of importing large databases, but even in the scenario where it consumes as little memory as possible, it will break most shared servers, simply because Go's garbage collector will not be fast enough to clean up after each batch is sent — I may have to take a look at how to do this better, perhaps with less concurrency.
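
For reference, the batched write that paragraph describes boils down to something like the sketch below. It is only an illustration, not the project's actual code, and it assumes the pre-1.0 badger `KV` API that `gosl.go` uses; the real loop is in the `importDatabase` changes further down.

```go
package importer // sketch only, not part of the commit

import (
	"runtime"

	"github.com/dgraph-io/badger" // pre-1.0 KV API, as used by gosl.go
)

// flushBatch writes one block of entries in a single BatchSet call, checks the
// per-entry errors, and then nudges the garbage collector so the block can be
// reclaimed before the next one is assembled.
func flushBatch(kv *badger.KV, batch []*badger.Entry) error {
	if len(batch) == 0 {
		return nil // nothing buffered yet
	}
	if err := kv.BatchSet(batch); err != nil {
		return err
	}
	for _, e := range batch { // BatchSet reports per-entry failures on e.Error
		if e.Error != nil {
			return e.Error
		}
	}
	runtime.GC() // ask for a collection between blocks to keep memory bounded
	return nil
}
```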
 
 This also works for OpenSimulator grids and you can use the same scripts and database if you wish. Currently, the database stores an extra field with the UUID, which is usually set to the name of the grid. Linden Lab sets these as 'Production' and 'Testing' respectively; other grid operators may use other names. There is no guarantee that every grid operator has configured their database with a unique name. Also, because of the way the key/value database works, it _assumes_ that all avatar names are unique across _all_ grids, which may not be true: only UUIDs are guaranteed to be unique. The reason this was implemented this way is that I wanted _very fast_ name2key searches, while I'm not worried about very slow key2name searches, since those are implemented in LSL anyway. Allowing many avatars with the same name but different UUIDs would require a different kind of database. Also, what should a function return for an avatar with three different UUIDs? You can see the problem there. Therefore I kept it simple, but be mindful of this limitation.
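
To make that trade-off concrete, here is a rough sketch of the key/value layout the paragraph implies. The real `avatarUUID` struct lives in `gosl.go` and its field names are not visible in this diff, so `UUID` and `Grid` below are assumptions:

```go
package layout // sketch only, not part of the commit

import "encoding/json"

// avatarUUID mirrors the value stored per avatar. The real struct is defined
// in gosl.go and its exact field names are not visible in this diff, so UUID
// and Grid here are assumptions.
type avatarUUID struct {
	UUID string
	Grid string
}

// encodeEntry shows why name2key is cheap and key2name is slow: the avatar
// *name* is the badger key, while the UUID only lives inside the JSON value,
// so looking a name up by UUID means iterating over and unmarshalling every
// stored value.
func encodeEntry(name, uuid, grid string) (key, value []byte, err error) {
	value, err = json.Marshal(avatarUUID{UUID: uuid, Grid: grid})
	return []byte(name), value, err
}
```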
 

+ 72 - 51
gosl.go

@@ -21,6 +21,7 @@ import (
 	"regexp"
 	"runtime"
 	"strings"
+	"sync"
 	"time"
 )
 
@@ -40,6 +41,7 @@ type avatarUUID struct {
 } 
 
 var BATCH_BLOCK = 100000	// how many entries to write to the database as a block; the bigger, the faster, but the more memory it consumes
+							// the authors say that 100000 is way too much
 
 /*
 				  .__			 
@@ -49,7 +51,9 @@ var BATCH_BLOCK = 100000	// how many entries to write to the database as a block
 |__|_|	(____  /__|___|	 /
 	  \/	 \/			  \/ 
 */
- 
+
+var noMemory *bool
+
 // main() starts here.
 func main() {
 	// Flag setup
@@ -58,7 +62,7 @@ func main() {
 	var isServer = flag.Bool("server", false, "Run as server on port " + *myPort)
 	var isShell  = flag.Bool("shell", false, "Run as an interactive shell")
 	var importFilename = flag.String("import", "", "Import database from W-Hat (use the csv.bz2 version)")
-	var noMemory = flag.Bool("nomemory", false, "Attempt to use only disk to save memory (important for shared webservers)")
+	noMemory = flag.Bool("nomemory", false, "Attempt to use only disk to save memory (important for shared webservers)")
 	
 	// default is FastCGI
 
@@ -118,10 +122,14 @@ func main() {
 	}
 	Opt.Dir = *myDir
 	Opt.ValueDir = Opt.Dir
+	//Opt.TableLoadingMode = options.MemoryMap
+	Opt.TableLoadingMode = options.FileIO
+
 	if *noMemory {
 //		Opt.TableLoadingMode = options.FileIO // use standard file I/O operations for tables instead of LoadRAM
-		Opt.TableLoadingMode = options.MemoryMap // MemoryMap indicates that that the file must be memory-mapped - https://github.com/dgraph-io/badger/issues/224#issuecomment-329643771
-		BATCH_BLOCK = 10000	// try to import less at each time, it will take longer but hopefully work
+//		Opt.TableLoadingMode = options.MemoryMap // MemoryMap indicates that the file must be memory-mapped - https://github.com/dgraph-io/badger/issues/224#issuecomment-329643771
+		Opt.TableLoadingMode = options.FileIO
+		BATCH_BLOCK = 1000	// try to import less at each time, it will take longer but hopefully work
 		log.Info("Trying to avoid too much memory consumption")	
 	}
 	kv, err := badger.NewKV(&Opt)
@@ -282,8 +290,12 @@ func searchKVUUID(avatarKey string) (name string, grid string) {
 	kv, err := badger.NewKV(&Opt)
 	checkErr(err) // should probably panic
 	itOpt := badger.DefaultIteratorOptions
-	itOpt.PrefetchValues = true
-	itOpt.PrefetchSize = 1000	// attempt to get this a little bit more efficient; we have many small entries, so this is not too much
+	if !*noMemory {
+		itOpt.PrefetchValues = true
+		itOpt.PrefetchSize = 1000	// attempt to get this a little bit more efficient; we have many small entries, so this is not too much
+	} else {
+		itOpt.PrefetchValues = false
+	}
 	itr := kv.NewIterator(itOpt)
 	var val = avatarUUID{ NullUUID, "" }
 	var found string
@@ -312,20 +324,47 @@ func searchKVUUID(avatarKey string) (name string, grid string) {
 //	One could theoretically set a cron job to get this file, save it on disk periodically, and keep the database up-to-date
 //	see https://stackoverflow.com/questions/24673335/how-do-i-read-a-gzipped-csv-file for the actual usage of these complicated things!
 func importDatabase(filename string) {
-	f, err := os.Open(filename)
+	filehandler, err := os.Open(filename)
 	if err != nil {
 		log.Fatal(err)
 	}
-	defer f.Close()
-	gr := bzip2.NewReader(f) // open bzip2 reader
+	defer filehandler.Close()
+	gr := bzip2.NewReader(filehandler) // open bzip2 reader
 	cr := csv.NewReader(gr)  // open csv reader and feed the bzip2 reader into it
+	
+	// prepare connection to KV database
 	kv, err := badger.NewKV(&Opt)
 	checkErrPanic(err) // should probably panic		
 	defer kv.Close()
-	time_start := time.Now()
-	var oneEntry badger.Entry	// make sure this one has at least some memory assigned to it
+	time_start := time.Now() // we want to get an idea on how long this takes
+	
+	// prepare stuff for asynchronous key reading; see https://godoc.org/github.com/dgraph-io/badger#KV.BatchSetAsync
+	wg := new(sync.WaitGroup)
+
+//	var oneEntry badger.Entry	// make sure this one has at least some memory assigned to it
 	limit := 0	// outside of for loop so that we can count how many entries we had in total
-	var batch = make([]badger.Entry, BATCH_BLOCK) // this is the actual data, or else we get pointers to nil
+	batch := make([]*badger.Entry, 0, BATCH_BLOCK) // this is the actual data
+
	// define an error-handling callback for the asynchronous batch sets
+	f := func(err error) {
+	    defer wg.Done()
+	    if err != nil {
+	        // At this point you can retry writing keys or send error over a channel to handle
+	        // in some other goroutine.
+	        log.Errorf("Got error: %+v\n", err)
+	    }
+	
+	    // Check for error in entries which could be non-nil if the user supplies a CasCounter.
+	    for _, e := range batch {
+	        if e.Error != nil {
+	            log.Errorf("Got error: %+v\n", e.Error)
+	        }
+	    }
+	
+	    // You can do cleanup now. Like deleting keys from cache.
+	    log.Info("All async sets complete.")
+	}
+
 	for ;;limit++ {
 		record, err := cr.Read()
 		if err == io.EOF {
@@ -338,58 +377,40 @@ func importDatabase(filename string) {
 		if err != nil {
 			log.Warning(err)
 		} else {
-			oneEntry = badger.Entry{ Key: []byte(record[1]), Value: []byte(jsonNewEntry)}
-			batch[limit % BATCH_BLOCK] = oneEntry
+//			oneEntry = badger.Entry{ Key: []byte(record[1]), Value: []byte(jsonNewEntry)}
+			batch = append(batch, &badger.Entry{
+				Key:   []byte(record[1]),
+				Value: []byte(jsonNewEntry),
+    		})
 		}
 		if limit % BATCH_BLOCK == 0 && limit != 0 { // we do not run on the first time, and then only every BATCH_BLOCK times
 			log.Debug("Processing:", limit)
-			go writeOneBatch(kv, batch)
+//			go writeOneBatch(kv, batch)
+			wg.Add(1)
+			kv.BatchSetAsync(batch, f)
+			log.Debug("Finished")
+			wg.Wait()
 			batch = nil // clear all entries, start a new batch
 			runtime.GC()
-			batch = make([]badger.Entry, BATCH_BLOCK)
+			batch = make([]*badger.Entry, 0, BATCH_BLOCK)
 		}
 	}
-	writeOneBatch(kv, batch)	// NOTE(gwyneth): these are the final ones (i.e. from the last round number of BATCH_BLOCK up to the end) 
-								// and we do not run them as goroutine because the function might terminate and close our channel before
-								// this finishes
-								// BUG(gwyneth): if BATCH_BLOCK is too small, or we finish exactly on the modulo, we may still have a
-								// racing condition! 
+	// NOTE(gwyneth): these are the final ones (i.e. from the last round number of BATCH_BLOCK up to the end) 
+	// and we do not run them as goroutine because the function might terminate and close our channel before
+	// this finishes
+	// BUG(gwyneth): if BATCH_BLOCK is too small, or we finish exactly on the modulo, we may still have a
+	// racing condition! 
+	wg.Add(1)
+	kv.BatchSetAsync(batch, f)
+	log.Debug("Finished last batch")
+	wg.Wait()
 	batch = nil // flag the garbage collector that we are finished with this array
+	runtime.GC()
 	time_end := time.Now()
 	diffTime := time_end.Sub(time_start)
 	log.Info("Total read", limit, "records (or thereabouts) in", diffTime)
 }
 
-// writeOneBatch copies all entries in this batch into the correct kind of array to push them out as a batch.
-// Since this is called as a goroutine, there will be thousands of those batches around!
-func writeOneBatch(kv *badger.KV, batch []badger.Entry) {
-	if kv == nil {
-		log.Panic("kv should NEVER be nil")
-		return
-	}
-	if batch == nil || len(batch) == 0 {
-		log.Panic("batch should NEVER be nil or have zero elements")
-		return
-	}
-	time_start := time.Now()
-	var entries = make([]*badger.Entry, BATCH_BLOCK)	// prepare entries for BatchSet
-	for i := 0; i < BATCH_BLOCK; i++ {
-		entries[i] = &batch[i]
-	}
-	if entries == nil || len(entries) == 0{
-		log.Panic("entries should NEVER be nil or have zero elements")
-		return
-	}
-	checkErr(kv.BatchSet(entries))
-		for _, e := range entries {
-		checkErr(e.Error)
-	}
-	entries = nil // flag the garbage collector that we're finished with this array
-	runtime.GC()
-	time_end := time.Now()
-	diffTime := time_end.Sub(time_start)
-	log.Debug("goroutine sent a batch of", BATCH_BLOCK, "records in", diffTime)
-}
 
 // NOTE(gwyneth):Auxiliary functions which I'm always using...
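
A closing note on the new `BatchSetAsync` flow in `importDatabase` above: every call is immediately followed by `wg.Wait()`, so the batches still run one at a time, and the shared callback `f` ranges over the outer `batch` variable, which is only safe because of that immediate wait. One way to actually overlap the writes, sketched below against the same pre-1.0 badger `KV` API, is to give each batch its own slice and callback and wait only once at the end; the length check also covers the final, possibly empty, batch that the `BUG(gwyneth)` note worries about. The `next()` helper is hypothetical, standing in for the CSV-reading code, and the standard `log` package is used here instead of the project's logger.

```go
package importer // sketch only, not part of the commit

import (
	"log" // stand-in for the project's logger
	"sync"

	"github.com/dgraph-io/badger" // pre-1.0 KV API, as used by gosl.go
)

const BATCH_BLOCK = 1000 // same knob as in gosl.go

// asyncImport overlaps the BatchSetAsync calls: each batch gets its own slice
// and its own callback, and we wait once at the end instead of after every
// batch. next() is a hypothetical source of entries (one per CSV record),
// returning nil when the input is exhausted.
func asyncImport(kv *badger.KV, next func() *badger.Entry) {
	var wg sync.WaitGroup

	sendBatch := func(entries []*badger.Entry) {
		if len(entries) == 0 {
			return // covers an import that ends exactly on a BATCH_BLOCK boundary
		}
		wg.Add(1)
		kv.BatchSetAsync(entries, func(err error) {
			defer wg.Done()
			if err != nil {
				log.Printf("batch error: %+v", err)
			}
			for _, e := range entries { // each callback checks its own slice
				if e.Error != nil {
					log.Printf("entry error: %+v", e.Error)
				}
			}
		})
	}

	batch := make([]*badger.Entry, 0, BATCH_BLOCK)
	for e := next(); e != nil; e = next() {
		batch = append(batch, e)
		if len(batch) == BATCH_BLOCK {
			sendBatch(batch)
			batch = make([]*badger.Entry, 0, BATCH_BLOCK) // fresh slice per batch in flight
		}
	}
	sendBatch(batch) // the final, possibly short, batch
	wg.Wait()        // wait once, after everything has been queued
}
```

Capping how many batches are in flight at once would need an extra counting semaphore, which is roughly the "less concurrency" idea the README change above mentions.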