Parsing Instagram with Java jsoup returns the page source instead of the elements shown in the inspector

Tags: , , ,



I’m trying to get a reel’s video URL with jsoup, using Java in Android Studio. I want to get the elements that I see in the browser’s inspector, but my code returns the raw page source instead. I have used jsoup in other projects on different web pages and never encountered this situation. Can you tell me what I am doing wrong and how I can get the elements shown in the inspector? Thank you.

  /**
   * Background task that downloads the Instagram reel page with jsoup and shows the
   * parsed document's HTML in {@code MainActivity.textView}.
   *
   * <p>If the download fails, an error message is shown instead of the page HTML.
   */
  public class fetchData extends AsyncTask<Void, Void, Void> {
        /** Parsed page; stays {@code null} when the download fails. */
        Document doc = null;
        /** Text handed to the UI in {@link #onPostExecute}: page HTML or an error message. */
        String str;

        /** Publishes the text produced by {@link #doInBackground} to the text view. */
        @Override
        protected void onPostExecute(Void aVoid) {
            super.onPostExecute(aVoid);
            MainActivity.textView.setText(str);
        }
    
        /**
         * Fetches the page and stores its HTML in {@code str}.
         *
         * @return always {@code null}; the result is passed through the {@code str} field
         */
        @Override
        protected Void doInBackground(Void... voids) {
            try {
                doc = Jsoup.connect("https://www.instagram.com/reel/CDok74FJzHp/?igshid=cam8ylb7okl7").get();
                // Convert inside the try block: when get() throws, doc is still null and
                // calling doc.toString() afterwards would crash with a NullPointerException.
                str = doc.toString();
            } catch (IOException e) {
                e.printStackTrace();
                str = "Failed to load page: " + e.getMessage();
            }
            return null;
        }
}

Answer

If you check the source of the page (inspect the video element) you’ll find:

<video class="tWeCl"
  playsinline="" 
  poster="https://instagram.flhr4-2.fna.fbcdn.net/v/t51.2885-15/e35/117157253_120443486171759_7332785595039685871_n.jpg?_nc_ht=instagram.flhr4-2.fna.fbcdn.net&amp;_nc_cat=111&amp;_nc_ohc=aX7rVh9IbGoAX_lj74j&amp;oh=ba74c5c8ad97ba14c35710addd523dfd&amp;oe=5F363C59" 
  preload="none" 
  type="video/mp4" 
  src="https://instagram.flhr4-2.fna.fbcdn.net/v/t50.2886-16/117284962_313567919762486_3343704909021624596_n.mp4?_nc_ht=instagram.flhr4-2.fna.fbcdn.net&amp;_nc_cat=102&amp;_nc_ohc=3wvoN4vNzkUAX_DLFTR&amp;oe=5F3659EF&amp;oh=7a38d593469a99239a7cb07050cc47f2">
</video>

If you then search the HTML for the mp4 URL, you’ll find it inside one of the <script> elements, where it is delivered as a JSON value assigned to a JavaScript variable. By splitting the script text on " = " and taking the second half, you get the raw JSON, which can then be parsed for "video_url" using Jayway’s JsonPath.read method.

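It would seem the video tag is therefore generated in the HTML by the JavaScript at runtime. jsoup only parses the HTML the server returns and does not execute JavaScript, which is why it doesn’t appear possible to filter the HTML for any <video> elements.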

import com.jayway.jsonpath.JsonPath;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Extracts the direct MP4 URL of an Instagram reel from the JSON that the page embeds
 * in one of its {@code <script>} elements.
 *
 * <p>The {@code <video>} element visible in the browser inspector is not present in the
 * document jsoup parses, so the URL is read from the embedded JSON instead.
 */
public class Instagram {

    /** Text separating the JavaScript assignment target from the JSON literal. */
    private static final String ASSIGNMENT_SEPARATOR = " = ";

    private final String url;

    /**
     * @param url address of the Instagram page to inspect
     */
    public Instagram(String url) {
        this.url = url;
    }

    /**
     * Downloads the page, looks for the first script whose JSON contains a
     * {@code video_url} entry and prints that URL to standard output.
     *
     * <p>When the page cannot be downloaded or no video URL is found, a message is
     * printed to standard error instead of failing with a runtime exception.
     */
    public void start() {
        Document doc = getHtmlPage(url);
        if (doc == null) {
            System.err.println("Could not download page: " + url);
            return;
        }

        Elements scriptElements = getScriptElementContainingVideoUrl(doc);
        List<String> candidateScripts = getSingleScriptElementWithVideoUrl(scriptElements);

        // More than one script may mention "mp4"; use the first that yields a URL.
        for (String scriptInnerHtml : candidateScripts) {
            String videoUrl = getVideoUrl(scriptInnerHtml);
            if (videoUrl != null) {
                System.out.println("Video Url: " + videoUrl);
                return;
            }
        }

        System.err.println("No video URL found in page: " + url);
    }

    /**
     * Collects the contents of every script element whose data mentions "mp4".
     *
     * @param scriptElements script elements of the page
     * @return script contents containing "mp4", in document order; empty if none match
     */
    private List<String> getSingleScriptElementWithVideoUrl(Elements scriptElements) {
        List<String> relevantTagWithMp4Url = new ArrayList<>();

        for (Element element : scriptElements) {
            String data = element.data();
            if (data.contains("mp4")) {
                relevantTagWithMp4Url.add(data);
            }
        }

        return relevantTagWithMp4Url;
    }

    /**
     * Selects all {@code <script>} elements of the document.
     */
    private Elements getScriptElementContainingVideoUrl(Document doc) {
        return doc.select("script");
    }

    /**
     * Reads the first {@code video_url} value from a script of the form
     * {@code <target> = <json>}.
     *
     * @param videoElement content of a script element
     * @return the first video URL, or {@code null} when the script is not an assignment
     *         or its JSON holds no {@code video_url} entry
     */
    private String getVideoUrl(String videoElement) {
        // Limit of 2 keeps the JSON intact even if it contains " = " itself.
        String[] assignment = videoElement.split(ASSIGNMENT_SEPARATOR, 2);
        if (assignment.length < 2) {
            return null;
        }

        String jsonResponse = assignment[1].trim();
        // Drop the JavaScript statement terminator so only the JSON literal remains.
        if (jsonResponse.endsWith(";")) {
            jsonResponse = jsonResponse.substring(0, jsonResponse.length() - 1);
        }

        // "$.." is JsonPath's deep-scan operator: it matches "video_url" at any depth.
        List<String> videoUrl = JsonPath.read(jsonResponse, "$..video_url");
        return videoUrl.isEmpty() ? null : videoUrl.get(0);
    }

    /**
     * Downloads and parses the page.
     *
     * @return the parsed document, or {@code null} if the request failed
     */
    private Document getHtmlPage(String url) {
        try {
            return Jsoup.connect(url).get();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }


    public static void main(String[] args) {
        new Instagram("https://www.instagram.com/reel/CDok74FJzHp/?igshid=cam8ylb7okl7").start();
    }
}


Source: stackoverflow