PHP vs Go
Rad pouzivam PHP, protoze s nim delam dlouhodobe a umim s nim cokoliv potrebuju. Nejsem zadny supr dupr programator, kaslu na nejaky konvence a neumim pouzivat hi-tech moderni metody, ale proste si udelam co chci 🙂 Obcas ale PHP zklame svym vykonem a tak jsem pred nedavnem s nekteryma vecma presel na Go.
Aktualne resim, ze mam k dispozici accesslog z proxy a nxd webserveru, v JSON formatu, ale neobsahuje informace ktery potrebuji, musim je tedy doplnit. Problem je vsak velikost souboru. Z NXD jde o hodinove logy o velikosti 2-4GB (4-9M zaznamu), u proxy jde o denni logy o velikosti 80-100GB (100M+ zaznamu).
Priklad takovy logu je zde:
{"size":"736","content-type":"text\/plain","host":"support0.bigo.sg","domain":"bigo.sg","tld":"sg","referer":null,"refdomain":"","reftld":"","ua":"BigoLive-Android 8911978509991121548","clientip":"124.123.167.250","method":"POST","timestamp":1609415999,"uri":"\/stats?encrypt=1","geoip":"IN"} {"size":"","content-type":"","host":"mraid.bigo.sg","domain":"bigo.sg","tld":"sg","referer":"http:\/\/mraid.bigo.sg\/","refdomain":"http:\/\/mraid.bigo.sg\/","reftld":"bigo.sg\/","ua":"Mozilla\/5.0 (Linux; Android 9; KSA-LX9 Build\/HONORKSA-LX9; wv) AppleWebKit\/537.36 (KHTML, like Gecko) Version\/4.0 Chrome\/87.0.4280.101 Mobile Safari\/537.36","clientip":"213.230.79.35","method":"GET","timestamp":1609415999,"uri":"\/favicon.ico","geoip":"UZ"} {"size":"1312","content-type":"text\/plain","host":"support0.bigo.sg","domain":"bigo.sg","tld":"sg","referer":null,"refdomain":"","reftld":"","ua":"BigoLive-Android 2267138772082122218","clientip":"117.194.191.92","method":"POST","timestamp":1609415999,"uri":"\/stats?encrypt=1","geoip":"IN"}
S logem potrebuju nalozit takto:
- doplnit informace o prohlizeci z UA
- doplnit a opravit informace o domene v refereru
Mohl bych to sice rovnou ukladat uz s temito daty, problem ale je, ze browscap.ini obsahuje pozadovany informace jen v ty nejvetsi variante a funkce get_browser() pak trva prilis dlouho, rapidne to zvysi zatez webserveru a projevilo se mi to na monetizaci trafficu propadem az na tretinovy castky, coz je neprimerena dan. Proto jsem se rozhodl tyto data doplnit az zpetne.
Mel jsem na to napsany jednoduchy PHP kod:
<?php
use MaxMind\Db\Reader;

require 'vendor/autoload.php';

// Cache the compiled browscap data on disk so repeated runs skip the expensive parse.
$fileCache = new \Doctrine\Common\Cache\FilesystemCache("/users/domainname/vendor/browscap/browscap-php/resources/");
$cache = new \Roave\DoctrineSimpleCache\SimpleCacheAdapter($fileCache);
$logger = new \Monolog\Logger('name');
$bc = new \BrowscapPHP\Browscap($cache, $logger);

include "effectiveTLDs.inc.php";
include "regDomain.inc.php";

// Normalize any truthy/falsy value to 1/0 for compact JSON output.
function truefalse($in) {
    return $in ? 1 : 0;
}

// Return the TLD portion of $domain.
// $real = FALSE: everything after the FIRST dot (matches the original behavior).
// $real = TRUE:  everything after the LAST dot.
function GetTLD($domain, $real = FALSE) {
    if (!$real) {
        return strtolower(substr($domain, strpos($domain, ".") + 1));
    }
    return strtolower(substr($domain, strrpos($domain, ".") + 1));
}

$file = $argv[1];
$handle = fopen($file, "r");
$uacache = []; // memoize getBrowser() results per UA string — the lookup dominates runtime

if ($handle) {
    while (($line = fgets($handle)) !== false) {
        $log = json_decode($line, 1);

        // Reuse the cached browser record for repeated user agents.
        // isset() instead of a bare truthiness test: avoids an undefined-index
        // notice and does not re-query when a cached value happens to be falsy.
        if (isset($uacache[$log['ua']])) {
            $browser = $uacache[$log['ua']];
        } else {
            $browser = $bc->getBrowser($log['ua']);
            $uacache[$log['ua']] = $browser;
        }

        // Re-derive the registered domain + TLD from the raw referer URL.
        if ($log['referer']) {
            $parse = parse_url($log['referer']);
            $log['refdomain'] = getRegisteredDomain(strtolower($parse['host']));
            $log['reftld'] = GetTLD($log['refdomain']);
        }

        $log['browser']['ua'] = $log['ua'];
        $log['browser']['os'] = $browser->platform;
        $log['browser']['os_ver'] = $browser->platform_version;
        $log['browser']['os_description'] = $browser->platform_description;
        $log['browser']['type'] = $browser->browser_type;
        $log['browser']['browser'] = $browser->browser; // was "–>" (en dash) in the original — a parse error
        $log['browser']['version'] = $browser->version;
        $log['browser']['device_name'] = $browser->device_name;
        $log['browser']['device_type'] = $browser->device_type;
        // NOTE(review): key "device_poiting" is misspelled but kept byte-for-byte
        // so downstream consumers of the enriched JSON keep working.
        $log['browser']['device_poiting'] = $browser->device_pointing_method;
        $log['browser']['cookies'] = truefalse($browser->cookies);
        $log['browser']['javascript'] = truefalse($browser->javascript);

        echo json_encode($log) . "\n";
    }
    fclose($handle);
}
?>
Ale trapila me jeho performance. Kdyz zkusime zpracovat log s 500.000 radky (235MB):
# time php convert.php test.json > test2.json real 1m25.426s user 1m12.178s sys 0m12.984s
Zkusil jsem script tedy prepsat do Go, kde navic jsem schopny poustet processy paralelne ve vice vlaknech:
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"log"
	"net/url"
	"os"
	"sync"

	bgo "github.com/digitalcrab/browscap_go"
	dname "github.com/weppos/publicsuffix-go/publicsuffix"
)

var wg sync.WaitGroup

// sem bounds the number of in-flight parsing goroutines.
var sem = make(chan struct{}, 8000)

// BrowserJSON is the browser/device record appended to each log line
// under the "browser" key.
type BrowserJSON struct {
	Browser        string `json:"browser"`
	BrowserVersion string `json:"version"`
	BrowserType    string `json:"type"`
	Platform       string `json:"os_description"`
	PlatformShort  string `json:"os"`
	PlatformVersion string `json:"os_ver"`
	DeviceType     string `json:"device_type"`
	DeviceName     string `json:"device_name"`
	Cookies        string `json:"cookies"`
	JavaScript     string `json:"javascript"`
}

// parseLine decodes one JSON access-log line, enriches it with browser
// info (from the UA string) and the registered referer domain, and
// prints the enriched record as JSON to stdout.
func parseLine(line string) {
	defer wg.Done()
	defer func() { <-sem }()

	var result map[string]interface{}
	if err := json.Unmarshal([]byte(line), &result); err != nil {
		return
	}

	// Type-assert instead of fmt.Sprint: a JSON null would otherwise
	// become the literal string "<nil>" and be treated as a real value.
	if ua, _ := result["ua"].(string); ua != "" {
		if browser, ok := bgo.GetBrowser(ua); ok {
			result["browser"] = BrowserJSON{
				Browser:         browser.Browser,
				BrowserVersion:  browser.BrowserVersion,
				BrowserType:     browser.BrowserType,
				Platform:        browser.Platform,
				PlatformShort:   browser.PlatformShort,
				PlatformVersion: browser.PlatformVersion,
				DeviceType:      browser.DeviceType,
				DeviceName:      browser.DeviceName,
				Cookies:         browser.Cookies,
				JavaScript:      browser.JavaScript,
			}
		}
	}

	if referer, _ := result["referer"].(string); referer != "" {
		if u, err := url.Parse(referer); err == nil {
			dn, err := dname.ParseFromListWithOptions(dname.DefaultList, u.Host, &dname.FindOptions{IgnorePrivate: true})
			if err == nil {
				result["refdomain"] = dn.SLD + "." + dn.TLD
				result["reftld"] = dn.TLD
			}
		}
	}

	data, err := json.Marshal(result)
	if err != nil {
		return
	}
	// NOTE(review): concurrent Println calls each issue a single Write, but
	// interleaving is not formally guaranteed — a mutex-guarded writer would
	// make the output ordering contract explicit.
	fmt.Println(string(data))
}

func main() {
	f, err := os.Open(os.Args[1])
	if err != nil {
		// The original logged and then scanned a nil handle — fail fast instead.
		log.Fatal(err)
	}
	defer f.Close()

	if err := bgo.InitBrowsCap("browscap.ini", false); err != nil {
		log.Fatal(err)
	}

	s := bufio.NewScanner(f)
	// Access-log lines can exceed bufio.Scanner's default 64 KiB token limit;
	// without this, Scan stops early and the error was never checked.
	s.Buffer(make([]byte, 0, 1024*1024), 1024*1024)

	for s.Scan() {
		wg.Add(1)          // track the goroutine before it starts
		sem <- struct{}{}  // blocks when 8000 lines are already in flight
		go parseLine(s.Text())
	}
	if err := s.Err(); err != nil {
		log.Fatal(err)
	}

	// Wait for every in-flight goroutine before exiting.
	wg.Wait()
}
Vysledek na stejnym souboru byl nasledujici:
# time ./dnname test.json > test2.json real 0m11.452s user 1m36.110s sys 0m6.978s
Srovnani je tedy neporovnatelne … 1 minuta 25s pro PHP, zatimco 11s pro Go … a to nejdele trvalo nacteni browscap.ini … tedy u vetsich souboru bude ten rozdil jeste vice znat.
Tak to bylo jen takove male okenko proc rad pouzivam Go na zpracovavani velkych souboru 🙂
PS: koukam ze mi code block rozmrdal taby … 🙁