Apache Beam Select Top N rows from PCollection in Go - go

I have a PCollection from which I need to choose the n largest rows. I'm trying to create a Dataflow pipeline using Go and am stuck at this.
package main
import (
"context"
"flag"
"fmt"
"github.com/apache/beam/sdks/v2/go/pkg/beam"
"github.com/apache/beam/sdks/v2/go/pkg/beam/log"
"github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx"
)
// User is one row of the in-memory sample data that main feeds into the
// pipeline.
type User struct {
Name string
Age int
}
// printRow writes one User element to standard output, followed by a newline.
// The context parameter is accepted but not used.
func printRow(ctx context.Context, list User) {
	fmt.Printf("%v\n", list)
}
// main builds a pipeline over a fixed list of Users, passes every row through
// an identity step, prints each row, and then runs the pipeline.
func main() {
	flag.Parse()
	beam.Init()

	pipeline := beam.NewPipeline()
	scope := pipeline.Root()

	users := []User{
		{Name: "Bob", Age: 5},
		{Name: "Adam", Age: 8},
		{Name: "John", Age: 3},
		{Name: "Ben", Age: 1},
		{Name: "Jose", Age: 1},
		{Name: "Bryan", Age: 1},
		{Name: "Kim", Age: 1},
		{Name: "Tim", Age: 1},
	}
	rows := beam.CreateList(scope, users)

	// Identity step: each incoming row is emitted again unchanged.
	passThrough := func(row User, emit func(User)) {
		emit(row)
	}
	pc2 := beam.ParDo(scope, passThrough, rows)
	beam.ParDo0(scope, printRow, pc2)

	ctx := context.Background()
	if err := beamx.Run(ctx, pipeline); err != nil {
		log.Exitf(ctx, "Failed to execute job: %v", err)
	}
}
From the above code I need to choose the top 5 rows based on User.Age.
I found the top package, which has a function that does the same thing, but its documentation says it returns a single-element PCollection. How is that different?
package main
import (
"context"
"flag"
"fmt"
"github.com/apache/beam/sdks/v2/go/pkg/beam"
"github.com/apache/beam/sdks/v2/go/pkg/beam/log"
"github.com/apache/beam/sdks/v2/go/pkg/beam/transforms/top"
"github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx"
)
// init registers the less comparator with the Beam SDK when the package is
// loaded; main later passes less to top.Largest.
func init() {
beam.RegisterFunction(less)
}
// User is one row of the sample data; less orders Users by their Age field.
type User struct {
Name string
Age int
}
// printRow prints a single User on its own line of standard output.
// ctx is part of the signature but is not consulted.
func printRow(ctx context.Context, list User) {
	fmt.Printf("%v\n", list)
}
// less reports whether a sorts before b when Users are ordered by ascending
// Age, i.e. whether a is strictly younger than b.
func less(a, b User) bool {
	return b.Age > a.Age
}
// main builds a pipeline that is meant to print the five Users with the
// largest Age.
//
// NOTE: as written this pipeline is rejected with
// "[]main.User is not assignable to main.User": best holds []User values,
// while the ParDo function below accepts a single User per call.
func main() {
flag.Parse()
beam.Init()
ctx := context.Background()
p := beam.NewPipeline()
s := p.Root()
var userList = []User{
{"Bob", 5},
{"Adam", 8},
{"John", 3},
{"Ben", 1},
{"Jose", 1},
{"Bryan", 1},
{"Kim", 1},
{"Tim", 1},
}
initial := beam.CreateList(s, userList)
// Select the 5 largest Users according to less (ascending Age).
best := top.Largest(s, initial, 5, less)
// This function takes a User, which does not match the []User carried by best.
pc2 := beam.ParDo(s, func(row User, emit func(User)) {
emit(row)
}, best)
beam.ParDo0(s, printRow, pc2)
if err := beamx.Run(ctx, p); err != nil {
log.Exitf(ctx, "Failed to execute job: %v", err)
}
}
I added the function to select the top 5 rows as shown above, but I get the error "[]main.User is not assignable to main.User".
I need the PCollection in the same format as before, since I have further processing to do. I suspect this is because the top.Largest function returns a single-element PCollection. Any ideas on how I can convert the format?

The elements of the best PCollection are of type []User,
so try accepting the slice and emitting its rows one by one:
// best carries []User values, so the function accepts the whole slice and
// emits its Users one at a time, giving pc2 User elements again.
pc2 := beam.ParDo(s, func(rows []User, emit func(User)) {
for _, row := range rows {
emit(row)
}
}, best)

Related

Decode part of the json message and determine the type of the rest based on this [duplicate]

This question already has answers here:
Decoding incoming JSON in Golang based on param values
(2 answers)
How to decode JSON based on object type in Golang
(1 answer)
Closed 5 months ago.
I get a message like
// message is the incoming JSON envelope with a "type" field and a "data"
// field.
type message struct {
Type string `json:"type"`
// Data is declared as an empty interface, so encoding/json decodes it into
// generic values rather than into a specific struct.
Data interface{} `json:"data"`
}
The type of Data depends on Type. Simply put, I want to read Type from the message, switch over it, and, depending on the result, json.Unmarshal Data into a specific structure. But it doesn't work that way: when I Unmarshal this message, Data immediately turns into a map[string]interface{}, and then I can no longer turn it into a structure normally (unless I Unmarshal twice). How can this problem be properly solved?
Then do two-staged decoding—exploiting the fact encoding/json has a special type RawMessage with the semantics "just save the sequence of bytes representing this value as is":
package main
import (
"encoding/json"
"fmt"
)
// message is the outer JSON envelope.
type message struct {
	// Type selects the concrete struct that Data is decoded into.
	Type string `json:"type"`
	// Data keeps the raw, still-encoded bytes of the "data" field.
	Data json.RawMessage `json:"data"`
}

// A is the payload decoded for messages whose type is "a".
type A struct {
	Foo string
	Bar string
}

// B is the payload decoded for messages whose type is "b".
type B struct {
	Quux  int
	Blorb []int
}

// decodeMessage decodes b in two stages: first the envelope, then the raw
// "data" bytes into the struct selected by the envelope's type.
//
// It returns an A value for type "a" and a B value for type "b". Any JSON
// error is returned as-is, and an unrecognised type yields a
// "cannot handle type" error.
func decodeMessage(b []byte) (interface{}, error) {
	var envelope message
	if err := json.Unmarshal(b, &envelope); err != nil {
		return nil, err
	}

	if envelope.Type == "a" {
		var payload A
		if err := json.Unmarshal(envelope.Data, &payload); err != nil {
			return nil, err
		}
		return payload, nil
	}

	if envelope.Type == "b" {
		var payload B
		if err := json.Unmarshal(envelope.Data, &payload); err != nil {
			return nil, err
		}
		return payload, nil
	}

	return nil, fmt.Errorf("cannot handle type: %s", envelope.Type)
}
// msgA is a sample message of type "a".
const msgA = `{
"type": "a",
"data": {
"Foo": "xyzzy",
"Bar": "r0xx0rz"
}
}`
// msgB is a sample message of type "b".
const msgB = `{
"type": "b",
"data": {
"Quux": 42,
"Blorb": [1, 2, 3, 4]
}
}`
// msgX is a sample message of type "x", which decodeMessage does not handle.
const msgX = `{
"type": "x",
"data": null
}`
// main decodes each sample message and prints the decoded value together
// with the error returned for it.
func main() {
	samples := []string{msgA, msgB, msgX}
	for i := range samples {
		decoded, err := decodeMessage([]byte(samples[i]))
		fmt.Println(decoded, err)
	}
}
Playground.
Another variant which is a bit different:
package main
import (
"encoding/json"
"fmt"
)
// message is the outer JSON envelope.
type message struct {
	// Type selects the concrete struct that Data is decoded into.
	Type string `json:"type"`
	// Data keeps the raw, still-encoded bytes of the "data" field.
	Data json.RawMessage `json:"data"`
}

// A is the payload decoded for messages whose type is "a".
type A struct {
	Foo string
	Bar string
}

// B is the payload decoded for messages whose type is "b".
type B struct {
	Quux  int
	Blorb []int
}

// decodeMessage decodes the envelope in b, allocates the struct that matches
// its type, and decodes the raw "data" bytes into that struct.
//
// On success the result is a *A for type "a" or a *B for type "b". An
// unrecognised type yields a "cannot handle type" error before any payload
// decoding is attempted; JSON errors are returned unchanged.
func decodeMessage(b []byte) (interface{}, error) {
	var envelope message
	if err := json.Unmarshal(b, &envelope); err != nil {
		return nil, err
	}

	// Dispatch table from type tag to a constructor for the target struct.
	constructors := map[string]func() interface{}{
		"a": func() interface{} { return new(A) },
		"b": func() interface{} { return new(B) },
	}
	construct, known := constructors[envelope.Type]
	if !known {
		return nil, fmt.Errorf("cannot handle type: %s", envelope.Type)
	}

	target := construct()
	if err := json.Unmarshal(envelope.Data, target); err != nil {
		return nil, err
	}
	return target, nil
}
// msgA is a sample message whose type is "a".
const msgA = `{
"type": "a",
"data": {
"Foo": "xyzzy",
"Bar": "r0xx0rz"
}
}`
// msgB is a sample message whose type is "b".
const msgB = `{
"type": "b",
"data": {
"Quux": 42,
"Blorb": [1, 2, 3, 4]
}
}`
// msgX is a sample message whose type "x" has no case in decodeMessage.
const msgX = `{
"type": "x",
"data": null
}`
// main runs decodeMessage over the three sample messages and prints each
// result alongside its error.
func main() {
	inputs := [...]string{msgA, msgB, msgX}
	for _, raw := range inputs {
		value, err := decodeMessage([]byte(raw))
		fmt.Println(value, err)
	}
}
Playground.

Is it possible to assert types dynamically in golang?

I have a method Deduplicate that returns a deduplicated copy of the passed-in slice as an interface{}. Is there a way to cast the interface{} value returned by this method to the same type as the one I passed in, without writing that type explicitly? For example, if I change the type of myStruct.RelatedIDs from []int to []uint, the code will no longer compile.
https://play.golang.org/p/8OT4xYZuwEn
package main
import (
"fmt"
"reflect"
)
// myStruct is the example record; main deduplicates its RelatedIDs slice.
type myStruct struct {
ID int
RelatedIDs []int
}
// main deduplicates the RelatedIDs of a sample myStruct and prints the result
// in Go syntax. The error returned by Deduplicate is discarded.
func main() {
	s := &myStruct{ID: 42, RelatedIDs: []int{1, 1, 2, 3}}

	deduped, _ := Deduplicate(s.RelatedIDs)
	s.RelatedIDs = deduped.([]int) // << can I assert type dynamically here?
	// s.RelatedIDs = v.(reflect.TypeOf(s.RelatedIDs)) // does not work

	fmt.Printf("%#v\n", s.RelatedIDs)
}
// Deduplicate returns a new slice, of the same dynamic type as slice, that
// holds the first occurrence of every distinct element in original order.
// The input slice is not modified.
//
// An error is returned, rather than a panic raised, when slice is nil, is not
// a slice, or has an element type that cannot be used as a map key (for
// example [][]int). For slices of interface type the element type itself is
// comparable, so an unhashable dynamic value stored in such a slice can still
// panic.
func Deduplicate(slice interface{}) (interface{}, error) {
	s := reflect.ValueOf(slice)
	// For a nil argument ValueOf yields the zero Value, whose Kind is Invalid,
	// so this single check also covers nil without touching a nil Type.
	if s.Kind() != reflect.Slice {
		return nil, fmt.Errorf("slice has wrong type: %T", slice)
	}
	if !s.Type().Elem().Comparable() {
		return nil, fmt.Errorf("slice elements are not comparable: %T", slice)
	}

	res := reflect.MakeSlice(s.Type(), 0, s.Len())
	seen := make(map[interface{}]struct{}, s.Len())
	for i := 0; i < s.Len(); i++ {
		v := s.Index(i)
		key := v.Interface()
		if _, ok := seen[key]; ok {
			continue
		}
		seen[key] = struct{}{}
		res = reflect.Append(res, v)
	}
	return res.Interface(), nil
}
Try this
package main
import (
"fmt"
"reflect"
)
// myStruct is the example record; main passes a pointer to its RelatedIDs
// field to Deduplicate.
type myStruct struct {
ID int
RelatedIDs []int
}
// main deduplicates the RelatedIDs of a sample myStruct through a pointer to
// the field, prints the returned error, then prints the slice in Go syntax.
func main() {
	s := &myStruct{ID: 42, RelatedIDs: []int{1, 1, 2, 3}}

	fmt.Println(Deduplicate(&s.RelatedIDs))
	// s.RelatedIDs = v.([]int) // << can I assert type dynamically here?
	// s.RelatedIDs = v.(reflect.TypeOf(s.RelatedIDs)) // does not work

	fmt.Printf("%#v\n", s.RelatedIDs)
}
// Deduplicate removes repeated elements from the slice that slice points to.
// The first occurrence of each value is kept, order is preserved, and the
// pointed-to slice is compacted in place and shortened.
//
// slice must be a non-nil pointer to a slice whose element type can be used
// as a map key. Any other argument (nil, a non-pointer, a pointer to a
// non-slice, a nil pointer, or a slice of unhashable elements) yields an
// error instead of a panic.
func Deduplicate(slice interface{}) error {
	rvs := reflect.ValueOf(slice)
	// Validate the pointer before dereferencing: Elem panics on kinds other
	// than interface and pointer, and failing either check alone is already
	// disqualifying, so the checks are joined with ||.
	if rvs.Kind() != reflect.Ptr || rvs.IsNil() || rvs.Elem().Kind() != reflect.Slice {
		return fmt.Errorf("slice has wrong type: %T", slice)
	}
	rvse := rvs.Elem()
	if !rvse.Type().Elem().Comparable() {
		return fmt.Errorf("slice elements are not comparable: %T", slice)
	}

	seen := make(map[interface{}]struct{}, rvse.Len())
	var e int // index of the next slot to keep
	for i := 0; i < rvse.Len(); i++ {
		v := rvse.Index(i)
		key := v.Interface()
		if _, ok := seen[key]; ok {
			continue
		}
		seen[key] = struct{}{}
		rvse.Index(e).Set(v)
		e++
	}
	// rvse is addressable through the pointer, so SetLen already updates the
	// caller's slice; no further Set is needed.
	rvse.SetLen(e)
	return nil
}
https://play.golang.org/p/hkEW4u1aGUi
with future generics, it might look like this https://go2goplay.golang.org/p/jobI5wKR8fU
For completeness, here is a generic version (Playground), which doesn't require reflection. Only open in February 2022! The usual caveats about NaN apply, though.
package main
import (
"fmt"
)
// main deduplicates a sample []int and prints the result in Go syntax.
func main() {
	fmt.Printf("%#v\n", Deduplicate([]int{1, 1, 2, 3}))
}
// Deduplicate returns a newly allocated slice holding the first occurrence of
// each distinct element of s, in the order those elements appear in s. The
// result is never nil and has capacity len(s); s itself is left untouched.
func Deduplicate[T comparable](s []T) []T {
	unique := make([]T, 0, len(s))
	visited := make(map[T]struct{})
	for i := range s {
		if _, dup := visited[s[i]]; !dup {
			visited[s[i]] = struct{}{}
			unique = append(unique, s[i])
		}
	}
	return unique
}
Output:
[]int{1, 2, 3}

How to remove duplicates in an interface array

Lets say the input can contain string or integer values
names = ["rahul", "rohit","srujan", "rahul"] --> output = ["rahul", "rohit","srujan"]
age=[12,18,12,21] --> output = [12,18,21]
we can make use of this function to filter duplicates
package main
import (
"fmt"
)
// unique returns the distinct values of intSlice in order of first
// appearance. The result is always a non-nil slice, even for empty input.
func unique(intSlice []int) []int {
	seen := map[int]bool{}
	result := make([]int, 0)
	for i := 0; i < len(intSlice); i++ {
		entry := intSlice[i]
		if seen[entry] {
			continue
		}
		seen[entry] = true
		result = append(result, entry)
	}
	return result
}
// main prints a sample slice, then the same slice with duplicates removed.
func main() {
	intSlice := []int{1, 5, 3, 6, 9, 9, 4, 2, 3, 1, 5}
	fmt.Println(intSlice)
	fmt.Println(unique(intSlice))
}
This works only if the input is a slice of a single, fixed element type (here int); a separate copy is needed for strings.
How can I make sure this function works for a slice of any type (e.g. via interface{})?
Use the reflect package to write a function that works with any slice type:
// unique returns a new slice with the same dynamic type as src that holds
// the first occurrence of each distinct element, in original order.
//
// src is inspected with reflection and is expected to be a slice; the caller
// type-asserts the result back to the concrete slice type.
func unique(src interface{}) interface{} {
	in := reflect.ValueOf(src)
	out := reflect.MakeSlice(in.Type(), 0, 0)
	seen := map[interface{}]struct{}{}
	for i, n := 0, in.Len(); i < n; i++ {
		item := in.Index(i)
		key := item.Interface()
		if _, dup := seen[key]; !dup {
			seen[key] = struct{}{}
			out = reflect.Append(out, item)
		}
	}
	return out.Interface()
}
Use it like this:
uniqueIntSlice := unique(intSlice).([]int)
Run the code on the Go Playground.
How to make sure this function works for an (unsorted) slice of empty interface{}:
Consider that empty interface{} values are comparable (https://stackoverflow.com/a/54003329/4466350).
Thus, to answer your question, it is very simple to rewrite your original code:
package main
import (
"fmt"
)
// main prints the distinct values of a sample []interface{}.
func main() {
	values := []interface{}{1, 5, 3, 6, 9, 9, 4, 2, 3, 1, 5}
	distinct := unique(values)
	fmt.Println(distinct)
}
// unique returns the distinct values of src in order of first appearance.
// Values are compared by using them as map keys. The result is always a
// non-nil slice.
func unique(src []interface{}) []interface{} {
	seen := make(map[interface{}]bool)
	distinct := make([]interface{}, 0)
	for i := range src {
		if seen[src[i]] {
			continue
		}
		seen[src[i]] = true
		distinct = append(distinct, src[i])
	}
	return distinct
}
https://play.golang.org/p/vW7vgwz9yc1
If your question become, how to remove duplicates of any slice type, please check this other answer https://stackoverflow.com/a/65191679/4466350
Nothing elegant and very prone to errors, but you can use a function that receives two interface{} arguments: the first one is the slice to filter and the second is a pointer to the filtered slice. Obviously, if the first parameter is a slice of int, the second one MUST be a pointer to a slice of int.
Inside the function you can check for the types of the parameters and treat them separately.
package main
import (
"fmt"
)
// unique appends the distinct elements of slice, in order of first
// appearance, to the slice that filtered points to.
//
// slice must be a []string or a []int, and filtered must be a non-nil pointer
// to a slice of the same element type (*[]string or *[]int respectively).
// Any other combination returns an error and leaves filtered untouched.
func unique(slice interface{}, filtered interface{}) error {
	switch src := slice.(type) {
	case []string:
		// A []string source requires a *[]string destination.
		dst, ok := filtered.(*[]string)
		if !ok {
			return fmt.Errorf("filtered should be of type %T, got %T instead", &[]string{}, filtered)
		}
		if dst == nil {
			return fmt.Errorf("filtered must not be a nil %T", filtered)
		}
		keys := make(map[string]bool, len(src))
		for _, entry := range src {
			if !keys[entry] {
				keys[entry] = true
				*dst = append(*dst, entry)
			}
		}
	case []int:
		// A []int source requires a *[]int destination.
		dst, ok := filtered.(*[]int)
		if !ok {
			// Report the type that is actually expected for this branch.
			return fmt.Errorf("filtered should be of type %T, got %T instead", &[]int{}, filtered)
		}
		if dst == nil {
			return fmt.Errorf("filtered must not be a nil %T", filtered)
		}
		keys := make(map[int]bool, len(src))
		for _, entry := range src {
			if !keys[entry] {
				keys[entry] = true
				*dst = append(*dst, entry)
			}
		}
	default:
		return fmt.Errorf("only slice of int or slice of string is supported")
	}
	return nil
}
// main filters one int slice and one string slice through unique, printing
// each input followed by its filtered result. A failure from unique is
// reported on standard output and does not stop the program.
func main() {
	intSlice := []int{1, 5, 3, 6, 9, 9, 4, 2, 3, 1, 5}
	stringSlice := []string{"a", "b", "b", "c", "c", "c", "d"}
	intSliceFiltered := []int{}
	stringSliceFiltered := []string{}

	fmt.Println(intSlice)
	// The destination must be passed as a pointer so unique can append to it.
	if err := unique(intSlice, &intSliceFiltered); err != nil {
		fmt.Printf("error filtering int slice: %v\n", err)
	}
	fmt.Println(intSliceFiltered)

	fmt.Println(stringSlice)
	// The destination must be passed as a pointer so unique can append to it.
	if err := unique(stringSlice, &stringSliceFiltered); err != nil {
		fmt.Printf("error filtering string slice: %v\n", err)
	}
	fmt.Println(stringSliceFiltered)
}
As I said, it's not elegant. I haven't checked this for errors.
Here it is running.

ECDH in golang - elliptic/Curve unmarshal fails

Unfortunately there is no built-in brainpool support in Go, so I'm trying to get ECDH working with the help of a fork from keybase.
Maybe I'm making a newbie mistake here, but my code is falling at the first hurdle (i.e. elliptic.Unmarshal returns nil)?
package main
import (
"fmt"
"io/ioutil"
"log"
"encoding/pem"
"crypto/ecdsa"
"crypto/rand"
"github.com/keybase/go-crypto/brainpool"
"crypto/elliptic"
"crypto/sha256"
)
// main reads a PEM-encoded public key from /tmp/TEST.pem, tries to turn it
// into a point on brainpool P512r1, and derives a SHA-256 digest of the
// x-coordinate obtained by multiplying that point with a fresh private key.
func main() {
fmt.Println("Hello")
content, err := ioutil.ReadFile("/tmp/TEST.pem")
if err != nil {
log.Fatal(err)
}
fmt.Printf("File contents: %s", content)
block, _ := pem.Decode(content)
if block == nil || block.Type != "PUBLIC KEY" {
log.Fatal("failed to decode PEM")
}
// NOTE: block.Bytes is the whole ASN.1 structure from the PEM body, not the
// encoded EC point alone, so this Unmarshal call returns nil.
x,y := elliptic.Unmarshal(brainpool.P512r1(),block.Bytes)
if x == nil {
log.Fatal("failed to unmarshal")
}
pubb := ecdsa.PublicKey {brainpool.P512r1(),x,y}
// The errors from GenerateKey and the y-coordinate from ScalarMult are discarded.
priva, _ := ecdsa.GenerateKey(brainpool.P512r1(), rand.Reader)
b, _ := pubb.Curve.ScalarMult(pubb.X, pubb.Y, priva.D.Bytes())
shared1 := sha256.Sum256(b.Bytes())
fmt.Printf("\nShared key %x\n", shared1)
Update to show the test key:
-----BEGIN PUBLIC KEY-----
MIGbMBQGByqGSM49AgEGCSskAwMCCAEBDQOBggAEM/zOLT7nMN374k902oTRZXnG
97DPzvqi8QQJaKXcq1BSrU/sNeUhOi6Y+hBcr7ZE+WZDYNoQkaMNrdhF+3x1XGx7
BTBFL3U1w2ENmkIPiDa2o0Q/wpSOLo/RFabdK5Q3/yvq0hoSdXlpKozE7UTre5cU
bJcUzjXvs9KDLEq54Fs=
-----END PUBLIC KEY-----
You are trying to unmarshal the raw ASN.1 message, not the public key part. You should first unmarshal the ASN.1 block, then unmarshal the EC data.
I was wrong in my comment, Go doesn't support the Brainpool functions. So I borrowed some code from their X509 package to build a custom parser for the 6 curves in the Brainpool package.
Playground: https://play.golang.org/p/i-Zd4mTugjU
package main
import (
"crypto/ecdsa"
"crypto/elliptic"
"crypto/rand"
"crypto/sha256"
"crypto/x509/pkix"
"encoding/asn1"
"encoding/pem"
"errors"
"fmt"
"log"
"math/big"
"github.com/keybase/go-crypto/brainpool"
)
const ecKey = `-----BEGIN PUBLIC KEY-----
MIGbMBQGByqGSM49AgEGCSskAwMCCAEBDQOBggAEM/zOLT7nMN374k902oTRZXnG
97DPzvqi8QQJaKXcq1BSrU/sNeUhOi6Y+hBcr7ZE+WZDYNoQkaMNrdhF+3x1XGx7
BTBFL3U1w2ENmkIPiDa2o0Q/wpSOLo/RFabdK5Q3/yvq0hoSdXlpKozE7UTre5cU
bJcUzjXvs9KDLEq54Fs=
-----END PUBLIC KEY-----`
// main decodes the embedded PEM public key, parses it as a brainpool EC key,
// generates a fresh P512r1 private key, multiplies the peer's public point by
// that private scalar, and prints the SHA-256 digest of the resulting
// x-coordinate as the shared key.
//
// Any failure (PEM decoding, key parsing, key generation) terminates the
// program through the log package.
func main() {
	block, _ := pem.Decode([]byte(ecKey))
	if block == nil || block.Type != "PUBLIC KEY" {
		log.Fatal("failed to decode PEM")
	}
	pub, err := parseBrainpoolPKIXPublicKey(block.Bytes)
	if err != nil {
		log.Fatalf("failed to parse key: %v", err)
	}
	pubb := ecdsa.PublicKey{Curve: brainpool.P512r1(), X: pub.X, Y: pub.Y}

	// Key generation can fail (e.g. the random source errors); without this
	// check priva would be nil and the dereference below would panic.
	priva, err := ecdsa.GenerateKey(brainpool.P512r1(), rand.Reader)
	if err != nil {
		log.Fatalf("failed to generate key: %v", err)
	}

	// Only the x-coordinate of the product is used as the shared secret.
	b, _ := pubb.Curve.ScalarMult(pubb.X, pubb.Y, priva.D.Bytes())
	shared1 := sha256.Sum256(b.Bytes())
	fmt.Printf("\nShared key %x\n", shared1)
}
// publicKeyInfo is the ASN.1 structure that parseBrainpoolPKIXPublicKey
// decodes the DER input into: an algorithm identifier followed by the key
// bits.
type publicKeyInfo struct {
Raw asn1.RawContent
Algorithm pkix.AlgorithmIdentifier
PublicKey asn1.BitString
}
// pkcs1PublicKey is used only as a decoding probe in
// parseBrainpoolPKIXPublicKey when the input does not parse as a
// publicKeyInfo.
type pkcs1PublicKey struct {
N *big.Int
E int
}
// Object identifiers that namedCurveFromOID maps to the corresponding
// brainpool curve constructors.
var (
oidNamedCurveBrainpoolP256r1 = asn1.ObjectIdentifier{1, 3, 36, 3, 3, 2, 8, 1, 1, 7}
oidNamedCurveBrainpoolP256t1 = asn1.ObjectIdentifier{1, 3, 36, 3, 3, 2, 8, 1, 1, 8}
oidNamedCurveBrainpoolP384r1 = asn1.ObjectIdentifier{1, 3, 36, 3, 3, 2, 8, 1, 1, 11}
oidNamedCurveBrainpoolP384t1 = asn1.ObjectIdentifier{1, 3, 36, 3, 3, 2, 8, 1, 1, 12}
oidNamedCurveBrainpoolP512r1 = asn1.ObjectIdentifier{1, 3, 36, 3, 3, 2, 8, 1, 1, 13}
oidNamedCurveBrainpoolP512t1 = asn1.ObjectIdentifier{1, 3, 36, 3, 3, 2, 8, 1, 1, 14}
)
// parseBrainpoolPKIXPublicKey decodes derBytes as a publicKeyInfo structure
// and, if its algorithm OID is 1.2.840.10045.2.1, hands it to
// parseBrainpoolPublicKey.
//
// Errors are returned when the DER does not parse (with a generic message if
// the bytes instead parse as a pkcs1PublicKey), when bytes remain after the
// structure, or when the algorithm OID differs.
func parseBrainpoolPKIXPublicKey(derBytes []byte) (pub *ecdsa.PublicKey, err error) {
	var info publicKeyInfo
	trailing, parseErr := asn1.Unmarshal(derBytes, &info)
	switch {
	case parseErr != nil:
		// Probe whether the input is a pkcs1PublicKey to pick the error.
		if _, probeErr := asn1.Unmarshal(derBytes, &pkcs1PublicKey{}); probeErr == nil {
			return nil, errors.New("failed to parse public key")
		}
		return nil, parseErr
	case len(trailing) != 0:
		return nil, errors.New("trailing data after ASN.1 of public-key")
	}

	wantAlgorithm := asn1.ObjectIdentifier{1, 2, 840, 10045, 2, 1}
	if !info.Algorithm.Algorithm.Equal(wantAlgorithm) {
		return nil, errors.New("not an ECDSA public key")
	}
	return parseBrainpoolPublicKey(&info)
}
// parseBrainpoolPublicKey builds an ECDSA public key from keyData.
//
// The algorithm parameters are decoded as a named-curve OID, the OID is
// resolved through namedCurveFromOID, and the right-aligned key bits are
// unmarshalled as a point on that curve. An error is returned if the
// parameters do not parse, carry trailing bytes, name an unsupported curve,
// or if the point cannot be unmarshalled.
func parseBrainpoolPublicKey(keyData *publicKeyInfo) (*ecdsa.PublicKey, error) {
	var curveOID asn1.ObjectIdentifier
	trailing, err := asn1.Unmarshal(keyData.Algorithm.Parameters.FullBytes, &curveOID)
	if err != nil {
		return nil, errors.New("failed to parse ECDSA parameters as named curve")
	}
	if len(trailing) != 0 {
		return nil, errors.New("trailing data after ECDSA parameters")
	}

	curve := namedCurveFromOID(curveOID)
	if curve == nil {
		return nil, errors.New("unsupported elliptic curve")
	}

	point := keyData.PublicKey.RightAlign()
	x, y := elliptic.Unmarshal(curve, point)
	if x == nil {
		return nil, errors.New("failed to unmarshal elliptic curve point")
	}
	return &ecdsa.PublicKey{Curve: curve, X: x, Y: y}, nil
}
// namedCurveFromOID returns the brainpool curve whose object identifier
// equals oid, or nil when oid matches none of the six known identifiers.
func namedCurveFromOID(oid asn1.ObjectIdentifier) elliptic.Curve {
	if oid.Equal(oidNamedCurveBrainpoolP256r1) {
		return brainpool.P256r1()
	}
	if oid.Equal(oidNamedCurveBrainpoolP256t1) {
		return brainpool.P256t1()
	}
	if oid.Equal(oidNamedCurveBrainpoolP384r1) {
		return brainpool.P384r1()
	}
	if oid.Equal(oidNamedCurveBrainpoolP384t1) {
		return brainpool.P384t1()
	}
	if oid.Equal(oidNamedCurveBrainpoolP512r1) {
		return brainpool.P512r1()
	}
	if oid.Equal(oidNamedCurveBrainpoolP512t1) {
		return brainpool.P512t1()
	}
	return nil
}

How to construct an $or query in mgo

I am trying to convert this JS MongoDB query into Go mgo query:
var foo = "bar";
db.collection.find({"$or": [ {uuid: foo}, {name: foo} ] });
This is what I've got so far, but it doesn't work:
conditions := bson.M{"$or": []bson.M{bson.M{"uuid": name}, bson.M{"name": name}}}
EDIT: It does seem to work now. Maybe I had a typo somewhere.
Here is a complete example which works fine for me (with Go 1.4, and MongoDB 2.6.5)
package main
import (
"fmt"
"log"
"gopkg.in/mgo.v2"
"gopkg.in/mgo.v2/bson"
)
// Person is the document type that main inserts into and reads back from the
// "people" collection.
type Person struct {
Num int
Uuid string
Name string
}
// main connects to MongoDB on localhost, recreates the test.people
// collection with three sample documents, and prints one document matching
// an $or query on uuid and name.
func main() {
	// Connect to the database
	session, err := mgo.Dial("localhost")
	if err != nil {
		panic(err)
	}
	defer session.Close()

	// Remove people collection if any
	people := session.DB("test").C("people")
	people.DropCollection()

	// Add some data
	docs := []interface{}{
		&Person{1, "UUID1", "Joe"},
		&Person{2, "UUID2", "Jane"},
		&Person{3, "UUID3", "Didier"},
	}
	if err = people.Insert(docs...); err != nil {
		log.Fatal(err)
	}

	// Match documents whose uuid is "UUID0" or whose name is "Joe".
	query := bson.M{"$or": []bson.M{{"uuid": "UUID0"}, {"name": "Joe"}}}
	var result Person
	if err = people.Find(query).One(&result); err != nil {
		log.Fatal(err)
	}
	fmt.Println(result)
}
package main
import (
"fmt"
"gopkg.in/mgo.v2"
"gopkg.in/mgo.v2/bson"
)
func GetAll()[]interface{}{
//Data Base Instance
session, err := mgo.Dial("localhost")
c := session.DB("yourDbName").C("YourCollectionName")
foo := "bar"
orQuery := []bson.M{}
uidFindQuery := bson.M{uuid: foo}
nameFindQuery := bson.M{name: foo}
orQuery = append(orQuery, uidFindQuery, nameFindQuery)
result := []interface{}{}
err := c.Find(bson.M{"$or":orQuery}).All(&result);
fmt.Println("error", err)
fmt.Println("Your expected Data",result)
return result
} `
Reference
https://gopkg.in/mgo.v2 Best Mongo db interface for go lang

Resources