Franta – Občasník malého ajťáka

Domény, Hosting, Cestování

PHP vs GO

Rad pouzivam PHP, protoze s nim delam dlouhodobe a umim s nim cokoliv potrebuju. Nejsem zadny supr dupr programator, kaslu na nejaky konvence a neumim pouzivat hi-tech moderni metody, ale proste si udelam co chci 🙂 Obcas ale PHP zklame svym vykonem a tak jsem pred nedavnem s nekteryma vecma presel na Go.

Aktualne resim, ze mam k dispozici accesslog z proxy a nxd webserveru, v JSON formatu, ale neobsahuje informace ktery potrebuji, musim je tedy doplnit. Problem je vsak velikost souboru. Z NXD jde o hodinove logy o velikosti 2-4GB (4-9M zaznamu), u proxy jde o denni logy o velikosti 80-100GB (100M+ zaznamu).

Priklad takoveho logu je zde:

{"size":"736","content-type":"text\/plain","host":"support0.bigo.sg","domain":"bigo.sg","tld":"sg","referer":null,"refdomain":"","reftld":"","ua":"BigoLive-Android 8911978509991121548","clientip":"124.123.167.250","method":"POST","timestamp":1609415999,"uri":"\/stats?encrypt=1","geoip":"IN"}
{"size":"","content-type":"","host":"mraid.bigo.sg","domain":"bigo.sg","tld":"sg","referer":"http:\/\/mraid.bigo.sg\/","refdomain":"http:\/\/mraid.bigo.sg\/","reftld":"bigo.sg\/","ua":"Mozilla\/5.0 (Linux; Android 9; KSA-LX9 Build\/HONORKSA-LX9; wv) AppleWebKit\/537.36 (KHTML, like Gecko) Version\/4.0 Chrome\/87.0.4280.101 Mobile Safari\/537.36","clientip":"213.230.79.35","method":"GET","timestamp":1609415999,"uri":"\/favicon.ico","geoip":"UZ"}
{"size":"1312","content-type":"text\/plain","host":"support0.bigo.sg","domain":"bigo.sg","tld":"sg","referer":null,"refdomain":"","reftld":"","ua":"BigoLive-Android 2267138772082122218","clientip":"117.194.191.92","method":"POST","timestamp":1609415999,"uri":"\/stats?encrypt=1","geoip":"IN"}

S logem potrebuju nalozit takto:

  • doplnit informace o prohlizeci z UA
  • doplnit a opravit informace o domene v refereru

Mohl bych to sice rovnou ukladat uz s temito daty, problem ale je, ze browscap.ini obsahuje pozadovany informace jen v ty nejvetsi variante a funkce get_browser() pak trva prilis dlouho, rapidne to zvysi zatez webserveru a projevilo se mi to na monetizaci trafficu propadem az na tretinovy castky, coz je neprimerena dan. Proto jsem se rozhodl tyto data doplnit az zpetne.

Mel jsem na to napsany jednoduchy PHP kod:

<?php
use MaxMind\Db\Reader;

require 'vendor/autoload.php';

// Filesystem-backed Doctrine cache rooted at the browscap-php resources directory.
$fileCache = new \Doctrine\Common\Cache\FilesystemCache("/users/domainname/vendor/browscap/browscap-php/resources/");
// Wrap the Doctrine cache in Roave's SimpleCacheAdapter; this wrapper is what Browscap receives below.
$cache = new \Roave\DoctrineSimpleCache\SimpleCacheAdapter($fileCache);
// Monolog logger passed to Browscap; no handlers are attached to it in this script.
$logger = new \Monolog\Logger('name');

// Browscap instance used by the main loop to resolve user-agent strings.
// NOTE(review): presumably the browscap data must already be present in the cache directory — confirm.
$bc = new \BrowscapPHP\Browscap($cache, $logger);

include "effectiveTLDs.inc.php";
include "regDomain.inc.php";

/**
 * Converts a flag to an integer for JSON output.
 *
 * @param mixed $in Value compared loosely (==) against true.
 * @return int 1 when $in is loosely equal to true, 0 otherwise.
 */
function truefalse($in){
    return ($in == true) ? 1 : 0;
}

/**
 * Returns the lower-cased suffix of a domain name.
 *
 * @param string $domain Domain name, e.g. "example.co.uk".
 * @param bool   $real   FALSE (default): everything after the FIRST dot ("co.uk").
 *                       TRUE: only the part after the LAST dot ("uk").
 * @return string The suffix, or '' when $domain contains no dot or ends with one.
 */
function GetTLD($domain, $real = FALSE) {
        $domain = strtolower((string) $domain);

        $dot = $real ? strrpos($domain, ".") : strpos($domain, ".");

        // Without this guard `false + 1` evaluates to 1 and the function
        // would return the name minus its first character ("localhost" -> "ocalhost").
        if ($dot === false) {
                return '';
        }

        // Cast because substr() returns false instead of '' on PHP < 8 when
        // the dot is the last character.
        return (string) substr($domain, $dot + 1);
}

// Main script: read a JSON-lines access log (path given as the first CLI
// argument), add a "browser" object derived from the user agent, recompute
// "refdomain"/"reftld" from the referer, and print each record to stdout.

if (!isset($argv[1])) {
    fwrite(STDERR, "Usage: php " . $argv[0] . " <logfile>\n");
    exit(1);
}

$file = $argv[1];

$handle = fopen($file, "r");

if ($handle === false) {
    fwrite(STDERR, "Cannot open " . $file . "\n");
    exit(1);
}

// Memo of user agent => Browscap result, so each distinct UA is resolved once per run.
$uacache = array();

while (($line = fgets($handle)) !== false) {

    $log = json_decode($line, true);

    // Blank or malformed lines do not decode to an object; there is nothing to enrich.
    if (!is_array($log)) {
        continue;
    }

    $ua = isset($log['ua']) ? $log['ua'] : null;
    $uaKey = (string) $ua;

    if (isset($uacache[$uaKey])) {
        $browser = $uacache[$uaKey];
    } else {
        $browser = $bc->getBrowser($ua);
        $uacache[$uaKey] = $browser;
    }

    if (!empty($log['referer'])) {
        // parse_url() yields null (no host) or false (malformed URL) here;
        // both are treated as an empty host, so the stored referer fields
        // are still overwritten.
        $host = parse_url($log['referer'], PHP_URL_HOST);
        $host = is_string($host) ? strtolower($host) : '';

        $log['refdomain'] = getRegisteredDomain($host);
        $log['reftld'] = GetTLD((string) $log['refdomain']);
    }

    $log['browser']['ua'] = $ua;
    $log['browser']['os'] = $browser->platform;
    $log['browser']['os_ver'] = $browser->platform_version;
    $log['browser']['os_description'] = $browser->platform_description;
    $log['browser']['type'] = $browser->browser_type;
    $log['browser']['browser'] = $browser->browser;
    $log['browser']['version'] = $browser->version;
    $log['browser']['device_name'] = $browser->device_name;
    $log['browser']['device_type'] = $browser->device_type;
    // The key spelling "device_poiting" is kept as-is: it is part of the emitted format.
    $log['browser']['device_poiting'] = $browser->device_pointing_method;
    $log['browser']['cookies'] = truefalse($browser->cookies);
    $log['browser']['javascript'] = truefalse($browser->javascript);

    echo json_encode($log)."\n";

}

fclose($handle);

?>

Ale trapila me jeho performance. Kdyz zkusime zpracovat log s 500.000 radky (235MB):

# time php convert.php test.json > test2.json

real	1m25.426s
user	1m12.178s
sys	0m12.984s

Zkusil jsem tedy script prepsat do Go, kde jsem navic schopny poustet procesy paralelne ve vice vlaknech:

package main

import (
 "bufio"
 "os"
 "log"
 "fmt"
 "net/url"
 "sync"
 "encoding/json"
 bgo "github.com/digitalcrab/browscap_go"
 dname "github.com/weppos/publicsuffix-go/publicsuffix"
)

// wg counts the parse_line goroutines in flight: main calls Add(1) before
// starting each one and Wait() once every input line has been handed out.
var wg sync.WaitGroup
// sem is a buffered channel used as a counting semaphore: main sends into it
// before starting a goroutine and parse_line receives from it when it is
// done, so at most 8000 parse_line goroutines exist at any time.
var sem = make(chan struct{}, 8000)

// BrowserJSON is the value parse_line stores under the "browser" key of an
// output record. The JSON tags fix the emitted key names; the field order
// fixes the order in which encoding/json writes them.
type BrowserJSON struct {
	// Browser identity.
	Browser        string `json:"browser"`
	BrowserVersion string `json:"version"`
	BrowserType    string `json:"type"`

	// Operating system: long description, short name, version.
	Platform        string `json:"os_description"`
	PlatformShort   string `json:"os"`
	PlatformVersion string `json:"os_ver"`

	// Device classification.
	DeviceType string `json:"device_type"`
	DeviceName string `json:"device_name"`

	// Capability flags, carried as strings.
	Cookies    string `json:"cookies"`
	JavaScript string `json:"javascript"`
}

func parse_line(line string){

    var result map[string]interface{}

    e := json.Unmarshal([]byte(line), &result)

    if e == nil {

 ua := fmt.Sprint(result["ua"])

        if ua != "" {
  browser, ok := bgo.GetBrowser(ua)
  if ok {
      result["browser"] = BrowserJSON{browser.Browser,browser.BrowserVersion,browser.BrowserType,browser.Platform,browser.PlatformShort,browser.PlatformVersion,browser.DeviceType,browser.DeviceName,browser.Cookies,browser.JavaScript}
  }
 }

 referer := fmt.Sprint(result["referer"])

 if referer != "" {

     url,err := url.Parse(referer)

     if err == nil {

  dn, err := dname.ParseFromListWithOptions(dname.DefaultList, url.Host, &dname.FindOptions{IgnorePrivate: true})

  if err == nil {
      result["refdomain"] = dn.SLD + "." + dn.TLD
      result["reftld"] = dn.TLD
  }
     }

 }

        data, _ := json.Marshal(result)
        fmt.Println(string(data))

    }

    defer wg.Done()
    <-sem

}

// main expects the path of a JSON-lines access log as its only argument and
// writes the enriched records to stdout. Any failure is logged and the
// process exits with a non-zero status.
func main() {
	if len(os.Args) < 2 {
		log.Fatalf("usage: %s <logfile>", os.Args[0])
	}

	if err := processFile(os.Args[1]); err != nil {
		log.Fatal(err)
	}
}

// processFile loads browscap.ini from the working directory, then starts one
// parse_line goroutine per line of filename and waits for all of them to
// finish. It returns the first error from opening the file, loading
// browscap.ini, or reading the input.
func processFile(filename string) error {
	f, err := os.Open(filename)
	if err != nil {
		return err
	}
	defer f.Close()

	if err := bgo.InitBrowsCap("browscap.ini", false); err != nil {
		return fmt.Errorf("loading browscap.ini: %w", err)
	}

	// The default Scanner limit is 64 KiB per line; a longer record would
	// end the loop early, so allow lines of up to 16 MiB.
	const maxLineBytes = 16 << 20
	s := bufio.NewScanner(f)
	s.Buffer(make([]byte, 0, 64*1024), maxLineBytes)

	for s.Scan() {
		// Count the worker, then take a semaphore slot; the send blocks
		// while all slots are in use.
		wg.Add(1)
		sem <- struct{}{}

		// Text() returns a fresh string, so it is safe to hand to the goroutine.
		go parse_line(s.Text())
	}

	// Let the workers already started finish before reporting a read error.
	wg.Wait()

	if err := s.Err(); err != nil {
		return fmt.Errorf("reading %s: %w", filename, err)
	}
	return nil
}

Vysledek na stejnym souboru byl nasledujici:

# time ./dnname test.json > test2.json

real	0m11.452s
user	1m36.110s
sys	0m6.978s

Srovnani je tedy neporovnatelne … 1 minuta 25s pro PHP, zatimco 11s pro Go … a to nejdele trvalo nacteni browscap.ini … tedy u vetsich souboru bude ten rozdil jeste vice znat.

Tak to bylo jen takove male okenko proc rad pouzivam Go na zpracovavani velkych souboru 🙂

PS: koukam ze mi code block rozmrdal taby … 🙁

Napsat komentář

Vaše e-mailová adresa nebude zveřejněna. Vyžadované informace jsou označeny *

Tato stránka používá Akismet k omezení spamu. Podívejte se, jak vaše data z komentářů zpracováváme.