Skip to content
Advertisement

How to extract information from PCollection after a join in apache beam?

I have two example streams of data on which I perform an inner join. I would like to extend this example join code and add some logic that runs after the join occurs.

public class JoinExample {

  public static void main(String[] args) {
    final Pipeline pipeline = Pipeline.create(pipelineOpts);

    PCollection<Row> adStream =
        pipeline
            .apply(From.source("kafka.adStream"))
            .apply(Select.fieldNames("ad.id", "ad.name"))
            .apply(Window.into(FixedWindows.of(Duration.standardSeconds(5))));

    PCollection<Row> clickStream =
        pipeline
            .apply(From.source("kafka.clickStream"))
            .apply(Select.fieldNames("ad.id", "numClicks"))
            .apply(Window.into(FixedWindows.of(Duration.standardSeconds(5))));

    adStream
        .apply(Join.<Row, Row>innerJoin(clickStream).using("id"))
        .apply(ConsoleOutput.of(Row::toString)); // Instead of this output, I would like to just print the ad name and num clicks after the join

    pipeline.run();
  }

I would like to print just the ad name and the number of clicks after the join, using a DoFn like this:

 // Asker's attempt, kept as posted; it does not compile as written.
 // It inner-joins adStream with clickStream on "id" and then tries to read the first
 // nested row of each joined element inside a ParDo.
 // NOTE(review): "DoFcn" is presumably meant to be Beam's DoFn, and the anonymous-class
 // header below has unbalanced angle brackets.
 adStream
    .apply(Join.<Row, Row>innerJoin(clickStream).using("id"))
    .apply(ParDo.of(new DoFcn(PCollection<Row>, int>() {

      public void processElement(ProcessContext c) {
        // Since there are two rows after the join, how can I get info from each row?
        // Example in:
        //    ad.id = 1, ad.name = test
        //    ad.id = 1, numClicks = 1000
        
        // After join
        // Row: [Row:[1, test], Row:[1, 1000]]
        
        // I tried this statement but it is incorrect
        Row one = c.element.getRow(0);  // This API is not available
      }
     } 

Any ideas on how to extract this info from the joined data?

Advertisement

Answer

As you learned, the Schema Join method emulates the SQL join, in which the result of the join is the concatenation of the rows from the joined PCollections. In order to see which rows went into the inner join, you have to use the CoGroup utility to join the PCollections. This returns a Row object with an individual iterable for each of the PCollections, containing the Rows that match the key. Example:

import org.apache.beam.sdk.schemas.transforms.CoGroup;
import org.apache.beam.sdk.values.PCollectionTuple;

public class JoinExample {

  public static void main(String[] args) {
    final Pipeline pipeline = Pipeline.create(pipelineOpts);

    PCollection<Row> adStream =
        pipeline
            .apply(From.source("kafka.adStream"))
            .apply(Select.fieldNames("ad.id", "ad.name"))
            .apply(Window.into(FixedWindows.of(Duration.standardSeconds(5))));

    PCollection<Row> clickStream =
        pipeline
            .apply(From.source("kafka.clickStream"))
            .apply(Select.fieldNames("ad.id", "numClicks"))          
            .apply(Window.into(FixedWindows.of(Duration.standardSeconds(5))));

    // The names given here for the PCollections can be used to retrieve the
    // the rows in the consuming PTransform. See below:
    PCollectionTuple.of("adStream", adStream, "clickStream", clickStream)
      // This selects the common field name in both adStream and clickStream 
      // to join on. See the documentation for ways of joining on
      // different keys.
      .apply(CoGroup.join(By.fieldNames("id")))
      .apply(ParDo.of(new DoFn<Row, int>() {
        public void processElement(ProcessContext c) 

          // Get key.
          String id = c.element.getValue("key").id;

          // Get rows from the adStream and clickStream PCollections that 
          // share the same id.
          Iterable<Row> adStream = c.element.getValue("adStream");
          Iterable<Row> clickStream = c.element.getValue("clickStream");

          return 0;
        }
      }));

     pipeline.run();
  }
}
Advertisement