How do you extract the URLs present in a given string?

In this article, we will learn how to extract the URLs present in a given string. This free Data Structures and Algorithms tutorial for complete beginners will help you learn Data Structures and Algorithms from scratch.

Extract URLs present in a given string - ❤️Data Structures and Algorithms Tutorials In 2024

Given a string S, the task is to find and extract all the URLs from the string. If no URL is present in the string, then print “-1”.

Examples:

Input: S = “Welcome to https://www.w3wiki.net Computer Science Portal”
Output: https://www.w3wiki.net
Explanation:
The given string contains the URL ‘https://www.w3wiki.net’.

Input: S = “Welcome to https://write.w3wiki.net portal of https://www.w3wiki.net Computer Science Portal”
Output:
https://write.w3wiki.net
https://www.w3wiki.net
Explanation:
The given string contains two URLs ‘https://write.w3wiki.net’ and ‘https://www.w3wiki.net’.

Approach: The idea is to use Regular Expression to solve this problem. Follow the steps below to solve the given problem:

Create a regular expression to extract all the URLs from the string as mentioned below:

regex = “\\b((?:https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|])”

Create an ArrayList in Java and compile the regular expression using Pattern.compile().
Match the given string with the regular expression. In Java, this can be done by using Pattern.matcher().
Find the substring from the first index of match result to the last index of the match result and add this substring into the list.
After completing the above steps, if the list is found to be empty, then print “-1” as there is no URL present in the string S. Otherwise, print all the strings stored in the list.

Below is the implementation of the above approach:

C++

#include <iostream>
#include <regex>
#include <vector>
using namespace std;
 
// Prints every URL found in `str`, one per line, in order of appearance.
// Prints "-1" when the string contains no URL.
//
// A URL is an http, https, ftp or file scheme followed by "://" and a run
// of URL characters. The final character may not be trailing punctuation
// such as '.', ',', ';', ':', '!' or '?', so a URL that ends a sentence is
// reported without the sentence's punctuation.
void extractURL(std::string str)
{
    // Both character classes start with '-' so that hyphenated hosts and
    // paths (e.g. "my-site.example.com/a-b") are matched in full instead of
    // being cut off at the first hyphen.
    static const std::regex url_pattern(
        "\\b((?:https?|ftp|file)://"
        "[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*"
        "[-a-zA-Z0-9+&@#/%=~_|])",
        std::regex_constants::icase);

    // Collect every non-overlapping match
    std::vector<std::string> url_list;
    const std::sregex_iterator last;
    for (std::sregex_iterator it(str.begin(), str.end(), url_pattern);
         it != last; ++it) {
        url_list.push_back(it->str());
    }

    // Report "-1" when nothing matched
    if (url_list.empty()) {
        std::cout << "-1" << std::endl;
        return;
    }

    // Iterate by const reference to avoid copying each URL
    for (const std::string& url : url_list) {
        std::cout << url << std::endl;
    }
}
 
// Driver Code
int main()
{
    // Given String str
    string str = "Welcome to https://www.w3wiki.net Computer Science Portal";
 
    // Function Call
    extractURL(str);
 
    return 0;
}

Java

import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
 
/**
 * Finds the URLs embedded in a piece of text and prints them.
 */
public class ExtractURL {

    /**
     * Matches http, https, ftp and file URLs, case-insensitively.
     *
     * <p>Both character classes start with {@code -} so that hyphenated hosts
     * and paths (for example {@code my-site.example.com/a-b}) are matched in
     * full rather than being cut off at the first hyphen. The last character
     * class leaves out sentence punctuation ({@code . , ; : ! ?}) so a URL at
     * the end of a sentence is reported without that punctuation.
     *
     * <p>Compiled once and reused; {@link Pattern} instances are immutable.
     */
    private static final Pattern URL_PATTERN = Pattern.compile(
            "\\b((?:https?|ftp|file)://"
                    + "[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*"
                    + "[-a-zA-Z0-9+&@#/%=~_|])",
            Pattern.CASE_INSENSITIVE);

    /**
     * Prints every URL found in {@code str} to standard output, one per line
     * and in order of appearance. Prints {@code -1} if no URL is found.
     *
     * @param str the text to scan; must not be {@code null}
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public static void extractURL(String str) {
        // Collect every non-overlapping match
        ArrayList<String> urlList = new ArrayList<>();
        Matcher matcher = URL_PATTERN.matcher(str);
        while (matcher.find()) {
            urlList.add(matcher.group());
        }

        // Report -1 when nothing matched
        if (urlList.isEmpty()) {
            System.out.println("-1");
            return;
        }

        for (String url : urlList) {
            System.out.println(url);
        }
    }

    /**
     * Runs the extractor on a sample sentence.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Given String str
        String str = "Welcome to https://www.w3wiki.net "
                + "Computer Science Portal";

        // Function Call
        extractURL(str);
    }
}

Python3

import re
 
def extractURL(str):
    """Print every URL found in ``str``, one per line, or "-1" if there is none.

    A URL is an http, https, ftp or file scheme followed by "://" and a run
    of URL characters; matching is case-insensitive.
    """
    # Pattern for a single URL (scheme, "://", then the URL body)
    url_pattern = re.compile(
        r'\b((?:https?|ftp|file):\/\/[-a-zA-Z0-9+&@#\/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#\/%=~_|])',
        re.IGNORECASE,
    )

    # Text of every non-overlapping match, in order of appearance
    found = [hit.group(0) for hit in url_pattern.finditer(str)]

    if found:
        # One URL per line
        for link in found:
            print(link)
    else:
        # Nothing matched
        print("-1")
 
# Entry point: run the extractor on a sample sentence
if __name__ == '__main__':

    # Sample sentence containing exactly one URL
    sample_text = "Welcome to https://www.w3wiki.net Computer Science Portal"

    # Prints each URL found, or -1 when none is present
    extractURL(sample_text)

C#

using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
 
// Finds the URLs embedded in a piece of text and prints them.
class Program
{
    // Prints every URL found in str, one per line and in order of appearance.
    // Prints "-1" when the string contains no URL.
    static void ExtractURL(string str)
    {
        // Matches http, https, ftp and file URLs. Both character classes start
        // with '-' so that hyphenated hosts and paths (e.g.
        // "my-site.example.com/a-b") are matched in full instead of being cut
        // off at the first hyphen. The scheme group is non-capturing because
        // only the whole match is used.
        string regexStr = @"\b((?:https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|])";

        // Case-insensitive so that schemes such as "HTTPS" are matched too
        Regex regex = new Regex(regexStr, RegexOptions.IgnoreCase);

        // Collect every non-overlapping match
        List<string> urlList = new List<string>();
        foreach (Match match in regex.Matches(str))
        {
            urlList.Add(match.Value);
        }

        // Report "-1" when nothing matched
        if (urlList.Count == 0)
        {
            Console.WriteLine("-1");
            return;
        }

        // Print all the URLs stored
        foreach (string url in urlList)
        {
            Console.WriteLine(url);
        }
    }

    // Runs the extractor on a sample sentence.
    static void Main()
    {
        // Given String str
        string str = "Welcome to https://www.w3wiki.net Computer Science Portal";

        // Function Call
        ExtractURL(str);
    }
}

Javascript

/**
 * Prints every URL found in `str`, one per line and in order of appearance.
 * Prints "-1" when the string contains no URL.
 *
 * A URL is an http, https, ftp or file scheme followed by "://" and a run of
 * URL characters. The final character may not be sentence punctuation
 * (. , ; : ! ?), so a URL that ends a sentence is reported without it.
 *
 * @param {string} str - Text to scan for URLs.
 * @returns {void}
 */
function extractURL(str) {
    // Both character classes start with '-' so that hyphenated hosts and
    // paths (e.g. "my-site.example.com/a-b") are matched in full instead of
    // being cut off at the first hyphen.
    // The regex is created per call because the 'g' flag makes it stateful
    // (lastIndex); 'i' makes the match case-insensitive.
    const urlPattern =
        /\b((?:https?|ftp|file):\/\/[-a-zA-Z0-9+&@#\/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#\/%=~_|])/gi;

    // Collect every non-overlapping match
    const urlList = [];
    let match;
    while ((match = urlPattern.exec(str)) !== null) {
        urlList.push(match[0]);
    }

    // Report "-1" when nothing matched
    if (urlList.length === 0) {
        console.log("-1");
        return;
    }

    // Print all the URLs stored in the array
    for (const url of urlList) {
        console.log(url);
    }
}
 
// Sample sentence containing exactly one URL
const sampleText = "Welcome to https://www.w3wiki.net Computer Science Portal";

// Prints each URL found, or -1 when none is present
extractURL(sampleText);

Output

https://www.w3wiki.net

Time Complexity: O(N)
Auxiliary Space: O(N) in the worst case, since every extracted URL is stored in a list before being printed.