{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Python Regular Expression Examples"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "These examples are adapted from Chapter 11 (Regular Expressions) of the online book *Python for Everybody* by Charles R. Severance.\n",
    "https://www.py4e.com/html3/11-regex\n",
    "\n",
    "The sample data is a short mailbox file, `mbox-short.txt`, containing a handful of email messages.\n",
    "https://www.py4e.com/code3/mbox-short.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import the regular expression package in python\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "From: stephen.marquard@uct.ac.za\n",
      "From: louis@media.berkeley.edu\n",
      "From: zqian@umich.edu\n",
      "From: rjlowe@iupui.edu\n",
      "From: zqian@umich.edu\n",
      "From: rjlowe@iupui.edu\n",
      "From: cwen@iupui.edu\n",
      "From: cwen@iupui.edu\n",
      "From: gsilver@umich.edu\n",
      "From: gsilver@umich.edu\n",
      "From: zqian@umich.edu\n",
      "From: gsilver@umich.edu\n",
      "From: wagnermr@iupui.edu\n",
      "From: zqian@umich.edu\n",
      "From: antranig@caret.cam.ac.uk\n",
      "From: gopal.ramasammycook@gmail.com\n",
      "From: david.horwitz@uct.ac.za\n",
      "From: david.horwitz@uct.ac.za\n",
      "From: david.horwitz@uct.ac.za\n",
      "From: david.horwitz@uct.ac.za\n",
      "From: stephen.marquard@uct.ac.za\n",
      "From: louis@media.berkeley.edu\n",
      "From: louis@media.berkeley.edu\n",
      "From: ray@media.berkeley.edu\n",
      "From: cwen@iupui.edu\n",
      "From: cwen@iupui.edu\n",
      "From: cwen@iupui.edu\n"
     ]
    }
   ],
   "source": [
    "# Print every line of the mailbox file that contains 'From:'\n",
    "# remember to change the filepath to the path on your computer\n",
    "with open('mbox-short.txt') as hand:  # the with-block closes the file once the loop is done\n",
    "    for line in hand:\n",
    "        line = line.rstrip()  # remove the newline and any other whitespace at the end of the line\n",
    "        if re.search('From:', line):\n",
    "            print(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "line does not start with 'From:'\n"
     ]
    }
   ],
   "source": [
    "# Check whether a line starts with 'From:'\n",
    "# '^' anchors the pattern to the beginning of the line, so 'From:' in the middle does not count\n",
    "line = \"edu From: rjlowe@iupui.edu\"\n",
    "starts_with_from = re.search('^From:', line) is not None\n",
    "print(\"line starts with 'From:'\" if starts_with_from else \"line does not start with 'From:'\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "line starts with From and have an at sign\n"
     ]
    }
   ],
   "source": [
    "# Check whether a line starts with 'From:' and has an at sign somewhere after it\n",
    "# \".\" matches any character (except a newline)\n",
    "# \"+\" repeats the preceding item one or more times\n",
    "# \"*\" repeats the preceding item zero or more times\n",
    "line = \"From: rjlowe@iupui.edu\"\n",
    "if re.search('^From:.+@', line) is None:\n",
    "    print(\"line does not contain the pattern\")\n",
    "else:\n",
    "    print(\"line starts with From and have an at sign\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['csev@umich.edu', 'cwen@iupui.edu']\n"
     ]
    }
   ],
   "source": [
    "# Extract data using regular expression\n",
    "# Extracting email addresses with a non-whitespace string before @ and another after @\n",
    "# In regular expression, the pattern is \\S+@\\S+\n",
    "# \"@2PM\" does not match with this regular expression because there is no string before @\n",
    "\n",
    "line = 'A message from csev@umich.edu to cwen@iupui.edu about meeting @2PM'\n",
    "# The pattern is a raw string (r'...') so the backslash in \\S reaches the regex engine\n",
    "# unchanged instead of being read as an (invalid) Python string escape.\n",
    "items = re.findall(r'\\S+@\\S+', line)  # \"\\S\" means a non-whitespace character\n",
    "print(items)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['umich.edu', 'iupui.edu']\n"
     ]
    }
   ],
   "source": [
    "# Search and extract data using regular expression\n",
    "# If you are only interested in the string after @, but you need the entire pattern to find it,\n",
    "# you can add parentheses around the part you want, using the regular expression \\S+@(\\S+).\n",
    "# findall then returns only the parenthesized group, i.e. the string after @\n",
    "line = 'A message from csev@umich.edu to cwen@iupui.edu about meeting @2PM'\n",
    "# The pattern is a raw string (r'...') so the backslash in \\S reaches the regex engine\n",
    "# unchanged instead of being read as an (invalid) Python string escape.\n",
    "items = re.findall(r'\\S+@(\\S+)', line)  # \"\\S\" means a non-whitespace character\n",
    "print(items)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['123abc456DEF']\n",
      "['123', '456']\n",
      "['123abc', '456DEF']\n"
     ]
    }
   ],
   "source": [
    "# common regular expressions\n",
    "# '^' - beginning of line\n",
    "# '$' - end of line\n",
    "# '.' - any single character (except a newline)\n",
    "# '\\d' - any single digit\n",
    "# '*' - zero or more occurrences of the preceding item\n",
    "# '+' - one or more occurrences of the preceding item\n",
    "# '\\S' - any single non-whitespace character\n",
    "# '[a-z]' - any single lowercase letter\n",
    "# '[A-Z]' - any single uppercase letter\n",
    "#\n",
    "# The patterns below are raw strings (r'...') so that backslashes such as the one\n",
    "# in '\\d' are passed to the regex engine as-is instead of being read as\n",
    "# (invalid) Python string escapes.\n",
    "\n",
    "line = \"123abc456DEF\"\n",
    "\n",
    "# find the entire line\n",
    "items = re.findall(r'^.*$', line)\n",
    "print(items)\n",
    "\n",
    "# find all numbers\n",
    "items = re.findall(r'(\\d+)', line)\n",
    "print(items)\n",
    "\n",
    "# find all strings that begin with one or more digits and end with one or more letters\n",
    "items = re.findall(r'(\\d+[a-zA-Z]+)', line)\n",
    "print(items)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exercise"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['123abc']\n"
     ]
    }
   ],
   "source": [
    "# find all strings that begin with one or more digits and end with one or more lowercase letters\n",
    "# the answer is '123abc'\n",
    "\n",
    "line = \"123abc456DEF\"\n",
    "\n",
    "# your code starts here\n",
    "# raw string so the backslash in \\d is passed to the regex engine unchanged;\n",
    "# '456DEF' is not returned because its digits are followed by uppercase letters\n",
    "items = re.findall(r'(\\d+[a-z]+)', line)\n",
    "print(items)\n",
    "\n",
    "# your code ends here"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['123']\n"
     ]
    }
   ],
   "source": [
    "# find the digits at the beginning of the line\n",
    "# the answer is '123'\n",
    "\n",
    "line = \"123abc456DEF\"\n",
    "\n",
    "# your code starts here\n",
    "# raw string so the backslash in \\d is passed to the regex engine unchanged;\n",
    "# '^' pins the match to the start of the line, so '456' is not returned\n",
    "items = re.findall(r'^(\\d+)', line)\n",
    "print(items)\n",
    "\n",
    "# your code ends here\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['456']\n"
     ]
    }
   ],
   "source": [
    "# find the digits between letters\n",
    "# the answer is '456'\n",
    "\n",
    "line = \"123abc456DEF\"\n",
    "\n",
    "# your code starts here\n",
    "# raw string so the backslash in \\d is passed to the regex engine unchanged;\n",
    "# a letter is required on BOTH sides of the digits, and only the parenthesized\n",
    "# digits are returned, so the leading '123' (no letter before it) is skipped\n",
    "items = re.findall(r'[a-zA-Z](\\d+)[a-zA-Z]', line)\n",
    "print(items)\n",
    "\n",
    "# your code ends here\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}