# These examples are adapted from Chapter 11 of the online book "Python for Everybody" by Charles R. Severance: https://www.py4e.com/html3/11-regex
# The sample data is a file of email messages: https://www.py4e.com/code3/mbox-short.txt
# Import the regular expression package in python
import re
# Search for lines that contain 'From:' and print each one.
# Remember to change the file path to the path on your computer.
# The 'with' statement closes the file automatically, even if an error occurs
# while reading it.
with open('mbox-short.txt') as hand:
    for line in hand:
        line = line.rstrip()  # remove trailing whitespace, including the newline
        # re.search() finds the pattern anywhere in the line
        if re.search(r'From:', line):
            print(line)
# Search for lines that start with 'From:'
# '^' anchors the pattern to the beginning of the line, so the "From:" in the
# middle of this sample line does not count as a match.
line = "edu From: rjlowe@iupui.edu"
found = re.search(r'^From:', line)  # a match object, or None when nothing matches
if found:
    print("line starts with 'From:'")
else:
    print("line does not start with 'From:'")
# Search for lines that start with 'From:' and contain an at sign
# "." matches any character, "+" means one or more of the preceding item,
# and "*" means zero or more of the preceding item.
line = "From: rjlowe@iupui.edu"
found = re.search(r'^From:.+@', line)  # a match object, or None when nothing matches
if found:
    print("line starts with From and have an at sign")
else:
    print("line does not contain the pattern")
# Extract data using a regular expression
# Extract email addresses: a non-whitespace string before @ and another after @.
# As a regular expression, that pattern is \S+@\S+
# "@2PM" does not match because there is no non-whitespace string before the @.
line = 'A message from csev@umich.edu to cwen@iupui.edu about meeting @2PM'
# The r'...' raw string passes the backslash in \S through to the regex engine;
# in a normal string "\S" is an invalid escape sequence and Python warns about it.
items = re.findall(r'\S+@\S+', line)  # "\S" means a non-whitespace character
print(items)
# Search and extract data using a regular expression
# If you only want the string after @, but need the entire pattern to find it,
# put parentheses around the part to extract: \S+@(\S+).
# re.findall() then returns only the parenthesized part (the string after @).
line = 'A message from csev@umich.edu to cwen@iupui.edu about meeting @2PM'
# The r'...' raw string passes the backslash in \S through to the regex engine;
# in a normal string "\S" is an invalid escape sequence and Python warns about it.
items = re.findall(r'\S+@(\S+)', line)  # "\S" means a non-whitespace character
print(items)
# Common regular expression syntax
# '^' - beginning of the line
# '$' - end of the line
# '.' - any single character (except a newline)
# '\d' - a single digit (0-9)
# '*' - zero or more occurrences of the preceding item
# '+' - one or more occurrences of the preceding item
# '\S' - a single non-whitespace character
# '[a-z]' - any single lowercase letter
# '[A-Z]' - any single uppercase letter
# Patterns are written as raw strings (r'...') so that backslash sequences
# such as \d reach the regex engine unchanged instead of being treated as
# (invalid) string escapes by Python.
line = "123abc456DEF"
# find the entire line
items = re.findall(r'^.*$', line)
print(items)
# find all numbers (each run of one or more digits)
items = re.findall(r'(\d+)', line)
print(items)
# find all strings that begin with one or more digits and end with one or more letters
items = re.findall(r'(\d+[a-zA-Z]+)', line)
print(items)
# find all strings that begin with one or more digits and end with one or more lowercase letters
# the answer is '123abc'
line = "123abc456DEF"
# your code starts here
# raw string: keeps the backslash in \d for the regex engine
items = re.findall(r'(\d+[a-z]+)', line)
print(items)
# your code ends here
# find the digits at the beginning of the line
# the answer is '123'
line = "123abc456DEF"
# your code starts here
# '^' anchors the digits to the start of the line;
# raw string: keeps the backslash in \d for the regex engine
items = re.findall(r'(^\d+)', line)
print(items)
# your code ends here
# find the digits between letters
# the answer is '456'
line = "123abc456DEF"
# your code starts here
# The digits must have a letter on BOTH sides:
#   [a-zA-Z]      - a letter immediately before the digits
#   (\d+)         - the digits to extract
#   (?=[a-zA-Z])  - a letter must follow, checked with a lookahead so it is not
#                   consumed and can also start the next match (e.g. "a1b2c")
digits_between_letters = re.compile(r'[a-zA-Z](\d+)(?=[a-zA-Z])')
items = digits_between_letters.findall(line)
print(items)
# your code ends here