{"id":65,"date":"2025-04-27T06:54:41","date_gmt":"2025-04-27T06:54:41","guid":{"rendered":"https:\/\/collincheuk.com\/?page_id=65"},"modified":"2025-05-03T06:12:25","modified_gmt":"2025-05-03T06:12:25","slug":"projects-page","status":"publish","type":"page","link":"https:\/\/collincheuk.com\/?page_id=65","title":{"rendered":"Projects"},"content":{"rendered":"<div id=\"cs-content\" class=\"cs-content\"><div class=\"x-section e65-e1 m1t-0 m1t-1 m1t-2\"><div class=\"x-div x-container max width e65-e2 m1t-d m1t-e m1t-f m1t-g m1t-h m1t-i m1t-j\"><div class=\"x-text x-text-headline e65-e3 m1t-p m1t-q m1t-r m1t-s\"><div class=\"x-text-content\"><div class=\"x-text-content-text\">\n<h2 class=\"x-text-content-text-primary\">Project Showcase<\/h2><\/div><\/div><\/div><div class=\"x-text x-content e65-e4 m1t-w\"><p>The collection.<\/p><\/div><\/div><\/div><div class=\"x-section e65-e5 m1t-0 m1t-3 m1t-4\"><div class=\"x-row x-container max width e65-e6 m1t-y m1t-z m1t-11 m1t-12\"><div class=\"x-row-inner\"><div class=\"x-col e65-e7 m1t-18\"><div class=\"x-row e65-e8 m1t-y m1t-10 m1t-13\"><div class=\"x-row-inner\"><div class=\"x-col e65-e9 m1t-18\"><div class=\"x-text x-text-headline has-graphic e65-e10 m1t-q m1t-r m1t-t\"><div class=\"x-text-content\"><span class=\"x-graphic\" aria-hidden=\"true\"><i class=\"x-icon x-graphic-child x-graphic-icon x-graphic-primary\" aria-hidden=\"true\" data-x-icon-o=\"&#xf00c;\"><\/i><\/span><div class=\"x-text-content-text\"><h3 class=\"x-text-content-text-primary\"><strong>Deep Learning NSFW Image Classifier<\/strong><\/h3>\n<p class=\"x-text-content-text-subheadline\">Deep Learning | Computer Vision<\/p><\/div><\/div><\/div><a class=\"x-anchor x-anchor-button has-graphic has-particle e65-e11 m1t-1a m1t-9\" tabindex=\"0\" href=\"#deep\" data-x-effect=\"{&quot;durationBase&quot;:&quot;300ms&quot;}\"><span class=\"x-particle is-primary\" data-x-particle=\"scale-x inside-b_l\" aria-hidden=\"true\"><span style=\"\"><\/span><\/span><div 
class=\"x-anchor-content\"><span class=\"x-graphic\" aria-hidden=\"true\"><i class=\"x-icon x-graphic-child x-graphic-icon x-graphic-primary\" aria-hidden=\"true\" data-x-icon-o=\"&#xf105;\"><\/i><i class=\"x-icon x-graphic-child x-graphic-icon x-graphic-secondary\" aria-hidden=\"true\" data-x-icon-o=\"&#xf105;\"><\/i><\/span><div class=\"x-anchor-text\"><span class=\"x-anchor-text-primary\">Learn More<\/span><\/div><\/div><\/a><\/div><div class=\"x-col e65-e12 m1t-18\"><div class=\"x-text x-text-headline has-graphic e65-e13 m1t-q m1t-r m1t-t\"><div class=\"x-text-content\"><span class=\"x-graphic\" aria-hidden=\"true\"><i class=\"x-icon x-graphic-child x-graphic-icon x-graphic-primary\" aria-hidden=\"true\" data-x-icon-o=\"&#xf00c;\"><\/i><\/span><div class=\"x-text-content-text\"><h3 class=\"x-text-content-text-primary\"><strong>SMS Spam Detection Classifier<\/strong><\/h3>\n<p class=\"x-text-content-text-subheadline\">NLP | Machine Learning | Text Classification<\/p><\/div><\/div><\/div><a class=\"x-anchor x-anchor-button has-graphic has-particle e65-e14 m1t-1a m1t-9\" tabindex=\"0\" href=\"#sms\" data-x-effect=\"{&quot;durationBase&quot;:&quot;300ms&quot;}\"><span class=\"x-particle is-primary\" data-x-particle=\"scale-x inside-b_l\" aria-hidden=\"true\"><span style=\"\"><\/span><\/span><div class=\"x-anchor-content\"><span class=\"x-graphic\" aria-hidden=\"true\"><i class=\"x-icon x-graphic-child x-graphic-icon x-graphic-primary\" aria-hidden=\"true\" data-x-icon-o=\"&#xf105;\"><\/i><i class=\"x-icon x-graphic-child x-graphic-icon x-graphic-secondary\" aria-hidden=\"true\" data-x-icon-o=\"&#xf105;\"><\/i><\/span><div class=\"x-anchor-text\"><span class=\"x-anchor-text-primary\">Learn More<\/span><\/div><\/div><\/a><\/div><div class=\"x-col e65-e15 m1t-18\"><div class=\"x-text x-text-headline has-graphic e65-e16 m1t-q m1t-r m1t-t\"><div class=\"x-text-content\"><span class=\"x-graphic\" aria-hidden=\"true\"><i class=\"x-icon x-graphic-child x-graphic-icon 
x-graphic-primary\" aria-hidden=\"true\" data-x-icon-o=\"&#xf00c;\"><\/i><\/span><div class=\"x-text-content-text\"><h3 class=\"x-text-content-text-primary\"><strong>IMDB Movie Review Sentiment Classifier<\/strong><\/h3>\n<p class=\"x-text-content-text-subheadline\">NLP | Deep Learning |&nbsp; Sentiment Analysis | Text Classification<\/p><\/div><\/div><\/div><a class=\"x-anchor x-anchor-button has-graphic has-particle e65-e17 m1t-1a m1t-9\" tabindex=\"0\" href=\"#imdb\" data-x-effect=\"{&quot;durationBase&quot;:&quot;300ms&quot;}\"><span class=\"x-particle is-primary\" data-x-particle=\"scale-x inside-b_l\" aria-hidden=\"true\"><span style=\"\"><\/span><\/span><div class=\"x-anchor-content\"><span class=\"x-graphic\" aria-hidden=\"true\"><i class=\"x-icon x-graphic-child x-graphic-icon x-graphic-primary\" aria-hidden=\"true\" data-x-icon-o=\"&#xf105;\"><\/i><i class=\"x-icon x-graphic-child x-graphic-icon x-graphic-secondary\" aria-hidden=\"true\" data-x-icon-o=\"&#xf105;\"><\/i><\/span><div class=\"x-anchor-text\"><span class=\"x-anchor-text-primary\">Learn More<\/span><\/div><\/div><\/a><\/div><div class=\"x-col e65-e18 m1t-18\"><div class=\"x-text x-text-headline has-graphic e65-e19 m1t-q m1t-r m1t-t\"><div class=\"x-text-content\"><span class=\"x-graphic\" aria-hidden=\"true\"><i class=\"x-icon x-graphic-child x-graphic-icon x-graphic-primary\" aria-hidden=\"true\" data-x-icon-o=\"&#xf00c;\"><\/i><\/span><div class=\"x-text-content-text\"><h3 class=\"x-text-content-text-primary\"><strong>Reddit Post Authorship &amp; Behavioral Analysis<\/strong><\/h3>\n<p class=\"x-text-content-text-subheadline\">NLP | Reddit API | Data Visualization | Text Classification&nbsp;<\/p><\/div><\/div><\/div><a class=\"x-anchor x-anchor-button has-graphic has-particle e65-e20 m1t-1a m1t-9\" tabindex=\"0\" href=\"#reddit\" data-x-effect=\"{&quot;durationBase&quot;:&quot;300ms&quot;}\"><span class=\"x-particle is-primary\" data-x-particle=\"scale-x inside-b_l\" 
aria-hidden=\"true\"><span style=\"\"><\/span><\/span><div class=\"x-anchor-content\"><span class=\"x-graphic\" aria-hidden=\"true\"><i class=\"x-icon x-graphic-child x-graphic-icon x-graphic-primary\" aria-hidden=\"true\" data-x-icon-o=\"&#xf105;\"><\/i><i class=\"x-icon x-graphic-child x-graphic-icon x-graphic-secondary\" aria-hidden=\"true\" data-x-icon-o=\"&#xf105;\"><\/i><\/span><div class=\"x-anchor-text\"><span class=\"x-anchor-text-primary\">Learn More<\/span><\/div><\/div><\/a><\/div><div class=\"x-col e65-e21 m1t-18\"><div class=\"x-text x-text-headline has-graphic e65-e22 m1t-q m1t-r m1t-t\"><div class=\"x-text-content\"><span class=\"x-graphic\" aria-hidden=\"true\"><i class=\"x-icon x-graphic-child x-graphic-icon x-graphic-primary\" aria-hidden=\"true\" data-x-icon-o=\"&#xf00c;\"><\/i><\/span><div class=\"x-text-content-text\"><h3 class=\"x-text-content-text-primary\"><strong>Salmon Weight Prediction with Support Vector Regression<\/strong><\/h3>\n<p class=\"x-text-content-text-subheadline\">Machine Learning | Regression | Data Cleaning<\/p><\/div><\/div><\/div><a class=\"x-anchor x-anchor-button has-graphic has-particle e65-e23 m1t-1a m1t-9\" tabindex=\"0\" href=\"#salmon\" data-x-effect=\"{&quot;durationBase&quot;:&quot;300ms&quot;}\"><span class=\"x-particle is-primary\" data-x-particle=\"scale-x inside-b_l\" aria-hidden=\"true\"><span style=\"\"><\/span><\/span><div class=\"x-anchor-content\"><span class=\"x-graphic\" aria-hidden=\"true\"><i class=\"x-icon x-graphic-child x-graphic-icon x-graphic-primary\" aria-hidden=\"true\" data-x-icon-o=\"&#xf105;\"><\/i><i class=\"x-icon x-graphic-child x-graphic-icon x-graphic-secondary\" aria-hidden=\"true\" data-x-icon-o=\"&#xf105;\"><\/i><\/span><div class=\"x-anchor-text\"><span class=\"x-anchor-text-primary\">Learn More<\/span><\/div><\/div><\/a><\/div><\/div><\/div><\/div><\/div><\/div><\/div><div class=\"x-section e65-e24 m1t-0 m1t-2 m1t-4 m1t-5 m1t-6\"><div class=\"x-div x-container max width e65-e25 
m1t-d m1t-e m1t-f m1t-g m1t-h m1t-i m1t-j\"><div class=\"x-text x-text-headline e65-e26 m1t-p m1t-q m1t-r m1t-u\"><div class=\"x-text-content\"><div class=\"x-text-content-text\">\n<h2 class=\"x-text-content-text-primary\"><strong id=\"deep\">Deep Learning NSFW Image Classifier<\/strong><\/h2><\/div><\/div><\/div><div class=\"x-text x-content e65-e27 m1t-w m1t-x\"><p><strong>Project Type:<\/strong> Image classification using TensorFlow and deep learning<br \/><strong>Type:<\/strong> Deep Learning | Computer Vision<br \/><strong>Tools:<\/strong> Python | TensorFlow | Keras | OpenCV | NumPy | Reddit API | Jupyter | GitHub<\/p>\n<p><strong>Overview:<\/strong><br \/>Developed a deep learning model to detect and classify unsolicited explicit images (commonly known as &ldquo;dick pics&rdquo;) to promote safer digital environments. The model distinguishes between explicit and non-explicit content with <strong>99.26% accuracy<\/strong> using a custom-built image dataset.<\/p>\n<p><strong>Key Technologies:<\/strong><\/p>\n<ul>\n<li><strong>Languages &amp; Frameworks:<\/strong> Python, TensorFlow, Keras, Scikit-learn<\/li>\n<li><strong>Libraries &amp; Tools:<\/strong> OpenCV (image processing), Pandas, NumPy, Matplotlib, PRAW (Reddit API), Jupyter Notebook, VS Code<\/li>\n<li><strong>OS &amp; Virtualization:<\/strong> Windows 10\/11, Linux Mint (via VMware), Anaconda Navigator<\/li>\n<li><strong>Version Control:<\/strong> GitHub<\/li>\n<\/ul>\n<p><strong>Process Highlights:<\/strong><\/p>\n<ul>\n<li><strong>Data Collection &amp; Preprocessing:<\/strong><\/li>\n<ul>\n<li>Manually gathered and web-scraped over <strong>250K unique images<\/strong><\/li>\n<li>Removed duplicates, resized images, maintained aspect ratios, labeled data<\/li>\n<\/ul>\n<li><strong>Model Architecture:<\/strong><\/li>\n<ul>\n<li><strong>5 Conv2D layers<\/strong> &rarr; <strong>Flatten<\/strong> &rarr; <strong>6 Dense layers<\/strong><\/li>\n<li>Final output layer uses sigmoid for binary 
classification<\/li>\n<li>Trained using binary cross-entropy loss, Adam optimizer<\/li>\n<\/ul>\n<li><strong>Training &amp; Evaluation:<\/strong><\/li>\n<ul>\n<li>Dataset split: 50% training, 25% validation, 25% testing<\/li>\n<li>Batch size: 128, Epochs: 35<\/li>\n<li>Achieved <strong>99.26% accuracy<\/strong> and <strong>0.0213 loss<\/strong> on test set (56K+ samples)<\/li>\n<\/ul>\n<\/ul>\n<p><strong>Impact:<\/strong><br \/>Successfully built a high-accuracy model capable of flagging explicit content, which could be integrated into social media platforms or messaging apps to improve user safety and content moderation.<\/p><\/div><\/div><\/div><div class=\"x-section e65-e28 m1t-0 m1t-2 m1t-4 m1t-6 m1t-7\"><div class=\"x-div e65-e29 m1t-d m1t-e m1t-f m1t-g m1t-h m1t-i m1t-j\"><div class=\"x-text x-text-headline e65-e30 m1t-p m1t-r m1t-v\"><div class=\"x-text-content\"><div class=\"x-text-content-text\">\n<h2 class=\"x-text-content-text-primary\">Gallery<\/h2><\/div><\/div><\/div><div class=\"x-div e65-e31 m1t-f m1t-h m1t-k m1t-l\" data-x-slide-context=\"\"><div class=\"x-div e65-e32 m1t-f m1t-g m1t-h m1t-i\"><div class=\"x-slide-container-viewport is-loading e65-e33 m1t-1b\" data-x-slide-container=\"{&quot;keyboardNavigation&quot;:true,&quot;snap&quot;:true,&quot;wrapAround&quot;:true,&quot;int&quot;:&quot;click drag off&quot;,&quot;direction&quot;:&quot;forward&quot;,&quot;startingSlide&quot;:1}\"><div class=\"x-slide-container-content\"><div class=\"x-slide-container is-inline is-paged\"><div class=\"x-slide e65-e34 m1t-1c\" data-x-slide=\"\"><span class=\"x-image e65-e35 m1t-1d\"><img decoding=\"async\" src=\"https:\/\/collincheuk.com\/wp-content\/uploads\/2025\/04\/DPC-flow-plan1-Page-3.drawio.png\" width=\"354\" height=\"143\" alt=\"Placeholder Image\" loading=\"lazy\"><\/span><\/div><div class=\"x-slide e65-e36 m1t-1c\" data-x-slide=\"\"><span class=\"x-image e65-e37 m1t-1d\"><img decoding=\"async\" 
src=\"https:\/\/collincheuk.com\/wp-content\/uploads\/2025\/04\/final-report-35-page.png\" width=\"880\" height=\"413\" alt=\"Placeholder Image\" loading=\"lazy\"><\/span><\/div><div class=\"x-slide e65-e38 m1t-1c\" data-x-slide=\"\"><span class=\"x-image e65-e39 m1t-1d\"><img decoding=\"async\" src=\"https:\/\/collincheuk.com\/wp-content\/uploads\/2025\/04\/DPC-flow-plan1-Page-4.drawio.png\" width=\"229\" height=\"114\" alt=\"Placeholder Image\" loading=\"lazy\"><\/span><\/div><div class=\"x-slide e65-e40 m1t-1c\" data-x-slide=\"\"><span class=\"x-image e65-e41 m1t-1d m1t-a\"><img decoding=\"async\" src=\"https:\/\/collincheuk.com\/wp-content\/uploads\/2025\/04\/DPC-flow-plan1-Crop-worflow.drawio.png\" width=\"610\" height=\"474\" alt=\"Placeholder Image\" loading=\"lazy\"><\/span><\/div><div class=\"x-slide e65-e42 m1t-1c\" data-x-slide=\"\"><span class=\"x-image e65-e43 m1t-1d m1t-1e\"><img decoding=\"async\" src=\"https:\/\/collincheuk.com\/wp-content\/uploads\/2025\/05\/Picture6.jpg\" width=\"2221\" height=\"1361\" alt=\"Placeholder Image\" loading=\"lazy\"><\/span><\/div><\/div><\/div><\/div><div class=\"x-div e65-e44 m1t-d m1t-f m1t-g m1t-h m1t-m m1t-b\"><a class=\"x-div e65-e45 m1t-d m1t-e m1t-f m1t-g m1t-i m1t-n m1t-9 m1t-c\" href=\"#prev\" data-x-effect-provider=\"colors particles effects\" data-x-effect=\"{&quot;durationBase&quot;:&quot;300ms&quot;}\" data-x-slide-prev=\"\"><i class=\"x-icon e65-e46 m1t-1f m1t-1g\" aria-hidden=\"true\" data-x-icon-o=\"&#xf104;\"><\/i><\/a><a class=\"x-div e65-e47 m1t-d m1t-e m1t-f m1t-i m1t-l m1t-n m1t-o m1t-9 m1t-c\" href=\"#next\" data-x-effect-provider=\"colors particles effects\" data-x-effect=\"{&quot;durationBase&quot;:&quot;300ms&quot;}\" data-x-slide-next=\"\"><i class=\"x-icon e65-e48 m1t-1f m1t-1h\" aria-hidden=\"true\" data-x-icon-o=\"&#xf105;\"><\/i><\/a><\/div><\/div><ul class=\"x-slide-pagination is-row e65-e49 m1t-1i\" data-x-slide-pagination=\"\"><\/ul><\/div><\/div><\/div><div class=\"x-section e65-e50 
m1t-0 m1t-3 m1t-4\"><div class=\"x-row x-container max width e65-e51 m1t-y m1t-z m1t-11 m1t-14\"><div class=\"x-row-inner\"><div class=\"x-col e65-e52 m1t-18\"><div class=\"x-div x-container max width e65-e53 m1t-d m1t-e m1t-f m1t-g m1t-h m1t-i m1t-j\"><div class=\"x-text x-text-headline e65-e54 m1t-p m1t-q m1t-r m1t-u\"><div class=\"x-text-content\"><div class=\"x-text-content-text\">\n<h2 class=\"x-text-content-text-primary\"><strong id=\"sms\">SMS Spam Detection Classifier<\/strong><\/h2><\/div><\/div><\/div><div class=\"x-text x-content e65-e55 m1t-w m1t-x\"><p><strong>Project Type:<\/strong> Text classification using Scikit-learn and classic machine learning<br \/><strong>Type:<\/strong> NLP | Machine Learning | Text Classification<br \/><strong>Tools:<\/strong> Python | Scikit-learn | TF-IDF | Naive Bayes | Logistic Regression<\/p>\n<p><strong>GitHub<\/strong>: <a href=\"https:\/\/github.com\/birdmoney11\/portfolio-samples\/tree\/main\/NLP%20SMS\">https:\/\/github.com\/birdmoney11\/portfolio-samples\/tree\/main\/NLP%20SMS<\/a><\/p>\n<p><strong>Overview:<\/strong><\/p>\n<p>Developed a natural language processing (NLP) pipeline to automatically classify SMS messages as <em>spam<\/em> or <em>ham (not spam)<\/em> using classic machine learning models and vectorization techniques. Achieved a <strong>97% test accuracy<\/strong> using a <strong>Naive Bayes classifier<\/strong>, and cross-verified with a Logistic Regression model.<\/p>\n<p><strong>Problem &amp; Motivation<\/strong><\/p>\n<p>With SMS marketing growing at over <strong>20% CAGR<\/strong> worldwide, filtering spam is increasingly critical. 
This project aims to implement a scalable spam detection system based on publicly available SMS datasets and common NLP techniques.<\/p>\n<p><strong>Dataset<\/strong><\/p>\n<ul>\n<li><strong>Source<\/strong>: <a href=\"https:\/\/archive.ics.uci.edu\/dataset\/228\/sms+spam+collection\">UCI SMS Spam Collection<\/a><\/li>\n<li><strong>Samples<\/strong>: 5,574 SMS messages (747 spam \/ 4,827 ham)<\/li>\n<li><strong>Languages<\/strong>: English (UK &amp; Singapore regional differences noted)<\/li>\n<li><strong>Challenge<\/strong>: Linguistic and vocabulary variation created natural biases in the dataset, which were mitigated through preprocessing and normalization.<\/li>\n<\/ul>\n<p><strong>Tools &amp; Technologies<\/strong><\/p>\n<ul>\n<li><strong>Languages<\/strong>: Python<\/li>\n<li><strong>Libraries<\/strong>: Scikit-learn, NLTK, Pandas, NumPy, Seaborn, Matplotlib<\/li>\n<li><strong>Models<\/strong>: Multinomial Naive Bayes, Logistic Regression<\/li>\n<li><strong>Vectorization<\/strong>: Bag of Words, <strong>TF-IDF<\/strong>, 1&ndash;3 gram range<\/li>\n<li><strong>Preprocessing<\/strong>: HTML unescape, punctuation\/stopword removal, lemmatization (WordNet)<\/li>\n<li><strong>Evaluation<\/strong>: Confusion Matrix, Classification Report, Heatmaps<\/li>\n<\/ul>\n<p><strong>Implementation Workflow<\/strong><\/p>\n<ol>\n<li><strong>Data Cleaning<\/strong>: HTML decoding, noise removal, lowercasing<\/li>\n<li><strong>Tokenization &amp; Lemmatization<\/strong>: Preprocessed into root words for better semantic matching<\/li>\n<li><strong>Feature Extraction<\/strong>:<\/li>\n<ul>\n<li>CountVectorizer (BoW)<\/li>\n<li>TF-IDF with unigrams, bigrams, and trigrams (max 2,500 features)<\/li>\n<\/ul>\n<li><strong>Model Training &amp; Testing<\/strong>:<\/li>\n<ul>\n<li>Stratified 75\/25 train-test split<\/li>\n<li>Applied both Naive Bayes and Logistic Regression for comparative analysis<\/li>\n<\/ul>\n<\/ol>\n<p><strong>Results<\/strong><\/p>\n<p><strong>Naive Bayes 
Classifier:<\/strong><\/p>\n<ul>\n<li><strong>Test Accuracy<\/strong>: 97%<\/li>\n<li><strong>Spam Precision<\/strong>: 0.77<\/li>\n<li><strong>Spam Recall<\/strong>: <strong>0.98<\/strong><\/li>\n<li><strong>F1-Score<\/strong>: 0.86<\/li>\n<\/ul>\n<p><strong>Logistic Regression:<\/strong><\/p>\n<ul>\n<li><strong>Test Accuracy<\/strong>: 96%<\/li>\n<li>Slightly lower spam precision than Naive Bayes<\/li>\n<\/ul>\n<p>Both models performed well, but Naive Bayes showed higher robustness in detecting spam with fewer false negatives&mdash;critical for spam detection applications.<\/p>\n<p>&nbsp;<\/p><\/div><\/div><\/div><\/div><\/div><\/div><div class=\"x-section e65-e56 m1t-0 m1t-3 m1t-4 m1t-8\"><div class=\"x-row x-container max width e65-e57 m1t-y m1t-z m1t-11 m1t-15\"><div class=\"x-row-inner\"><div class=\"x-col e65-e58 m1t-18\"><div class=\"x-div x-container max width e65-e59 m1t-d m1t-e m1t-f m1t-g m1t-h m1t-i m1t-j\"><div class=\"x-text x-text-headline e65-e60 m1t-p m1t-q m1t-r m1t-u\"><div class=\"x-text-content\"><div class=\"x-text-content-text\">\n<h2 class=\"x-text-content-text-primary\"><strong id=\"imdb\">IMDB Movie Review Sentiment Classifier<\/strong><\/h2><\/div><\/div><\/div><div class=\"x-text x-content e65-e61 m1t-w m1t-x\"><p><strong>Project Type:<\/strong> Sentiment analysis using TensorFlow and deep learning<br \/><strong>Type: <\/strong>NLP | Deep Learning |&nbsp; Sentiment Analysis | Text Classification<strong><br \/><\/strong><strong>Tools:<\/strong> Python | TensorFlow | Keras&nbsp;<\/p>\n<p><strong>GitHub<\/strong>:&nbsp;<a href=\"https:\/\/github.com\/birdmoney11\/portfolio-samples\/tree\/main\/ML%20IMDB\">https:\/\/github.com\/birdmoney11\/portfolio-samples\/tree\/main\/ML%20IMDB<\/a><\/p>\n<p><strong>Overview:<\/strong><\/p>\n<p>A binary classification project using deep learning to determine sentiment polarity (positive or negative) of movie reviews from the IMDB dataset.<\/p>\n<p><strong>&nbsp;Problem &amp; 
Dataset<\/strong><\/p>\n<ul>\n<li><strong>Task<\/strong>: Classify movie reviews as either <em>positive<\/em> (&ge;7\/10) or <em>negative<\/em> (&le;4\/10) to avoid ambiguity from neutral scores.<\/li>\n<li><strong>Dataset<\/strong>: IMDB dataset from keras.datasets, with <strong>50,000 pre-tokenized reviews<\/strong>, split evenly into training and test sets. Each review is encoded using a Bag-of-Words (BoW) technique.<\/li>\n<\/ul>\n<p><strong>&nbsp;Model Architecture<\/strong><\/p>\n<ul>\n<li>Built with <strong>Keras Sequential API<\/strong>, the model comprises:<\/li>\n<ul>\n<li><strong>Embedding layer<\/strong> for word vector representation<\/li>\n<li><strong>GlobalAveragePooling1D<\/strong> to flatten embeddings<\/li>\n<li><strong>Dense layers<\/strong> with ReLU activation<\/li>\n<li><strong>Sigmoid output<\/strong> for binary classification<\/li>\n<\/ul>\n<\/ul>\n<p><strong>Tools &amp; Technologies<\/strong><\/p>\n<ul>\n<li><strong>Languages<\/strong>: Python<\/li>\n<li><strong>Libraries<\/strong>: TensorFlow, Keras, NumPy, Pandas, Matplotlib, Seaborn<\/li>\n<li><strong>IDE<\/strong>: Jupyter Notebook, VS Code<\/li>\n<li><strong>Evaluation Tools<\/strong>: Confusion Matrix, Classification Report (precision, recall, f1-score)<\/li>\n<\/ul>\n<p><strong>Evaluation &amp; Results<\/strong><\/p>\n<ul>\n<li><strong>Accuracy<\/strong>: <strong>87.52%<\/strong><\/li>\n<li><strong>Loss<\/strong>: 0.3723<\/li>\n<li><strong>Metrics<\/strong>:<\/li>\n<ul>\n<li>Positive review recall: 0.90<\/li>\n<li>Negative review precision: 0.89<\/li>\n<li>Balanced F1-scores (~0.88) indicate strong performance for both classes<\/li>\n<\/ul>\n<\/ul>\n<p><strong>Key Features &amp; Highlights<\/strong><\/p>\n<ul>\n<li>Applied <strong>Hold-Out Validation<\/strong> to split data into training and validation subsets.<\/li>\n<li>Used <strong>classification metrics and visualization<\/strong> (confusion matrix, precision-recall plots) for in-depth model evaluation.<\/li>\n<li>Demonstrated 
ability to <strong>preprocess data<\/strong>, <strong>build text pipelines<\/strong>, and <strong>tune neural networks<\/strong>.<\/li>\n<\/ul><\/div><\/div><\/div><\/div><\/div><\/div><div class=\"x-section e65-e62 m1t-0 m1t-3 m1t-4\"><div class=\"x-row x-container max width e65-e63 m1t-y m1t-z m1t-11 m1t-16\"><div class=\"x-row-inner\"><div class=\"x-col e65-e64 m1t-18\"><div class=\"x-div x-container max width e65-e65 m1t-d m1t-e m1t-f m1t-g m1t-h m1t-i m1t-j\"><div class=\"x-text x-text-headline e65-e66 m1t-p m1t-q m1t-r m1t-u\"><div class=\"x-text-content\"><div class=\"x-text-content-text\">\n<h2 class=\"x-text-content-text-primary\"><strong id=\"reddit\">Reddit Post Authorship &amp; Behavioral Analysis<\/strong><\/h2><\/div><\/div><\/div><div class=\"x-text x-content e65-e67 m1t-w m1t-x\"><p><strong>Project Type:<\/strong> NLP and Data Visualization <br \/><strong>Type: <\/strong>NLP | Reddit API | Data Visualization | Text Classification&nbsp;<strong><br \/><\/strong><strong>Tools:<\/strong> Python | Pandas | Matplotlib | Seaborn<\/p>\n<p><strong>GitHub<\/strong>:&nbsp;<a href=\"https:\/\/github.com\/birdmoney11\/portfolio-samples\/tree\/main\/DS%20Reddit%20post\">https:\/\/github.com\/birdmoney11\/portfolio-samples\/tree\/main\/DS%20Reddit%20post<\/a><\/p>\n<p><strong>Project Objective<\/strong><\/p>\n<p>Analyzed the writing style, vocabulary, and activity patterns of two Reddit users&mdash;Shittymorph (SM) and GuyWithFacts (GWF)&mdash;to evaluate the hypothesis: <em>Are these accounts operated by the same person?<\/em><br \/>This project combines <strong>natural language processing (NLP)<\/strong>, <strong>behavioral analysis<\/strong>, and <strong>custom visualization<\/strong> to explore authorship attribution in real-world online forums.<\/p>\n<p><strong>Tools &amp; Technologies<\/strong><\/p>\n<ul>\n<li><strong>Languages &amp; Libraries<\/strong>: Python, Pandas, NumPy, Seaborn, Matplotlib<\/li>\n<li><strong>APIs<\/strong>: PRAW (Python Reddit API 
Wrapper)<\/li>\n<li><strong>NLP Techniques<\/strong>: Tokenization, stopword removal, custom regex filtering<\/li>\n<li><strong>IDE<\/strong>: Jupyter Notebook<\/li>\n<\/ul>\n<p><strong>Data Collection &amp; Preprocessing<\/strong><\/p>\n<ul>\n<li>Fetched Reddit comments using <strong>PRAW<\/strong> for both SM and GWF accounts<\/li>\n<li>Removed repeated quotes and &ldquo;copypasta&rdquo; (e.g., Undertaker meme lines) using regex and str.replace()<\/li>\n<li>Applied lowercasing, punctuation removal, and stopword filtering<\/li>\n<li>Created separate labeled datasets for side-by-side analysis<\/li>\n<\/ul>\n<p><strong>Feature Engineering &amp; Analysis<\/strong><\/p>\n<ul>\n<li><strong>Word Cloud Visualizations<\/strong>: Captured vocabulary richness and dominant word themes<\/li>\n<li><strong>Comment Length Histograms<\/strong>: Showed SM preferred short posts (peaking at 50&ndash;100 words), while GWF leaned toward long-form replies (100&ndash;300+ words)<\/li>\n<li><strong>Hourly Activity Heatmaps<\/strong>: Compared time-of-day posting patterns across users<\/li>\n<li><strong>Lexical Analysis<\/strong>: Identified differences in tone, sentiment, and topic focus<\/li>\n<\/ul>\n<p><strong>Key Results<\/strong><\/p>\n<ul>\n<li><strong>Distinct Posting Behavior<\/strong>: SM used more informal, meme-driven language; GWF&rsquo;s posts were longer and more introspective<\/li>\n<li><strong>Unique Vocabularies<\/strong>: Clear divergence in top-used words&mdash;no major overlap<\/li>\n<li><strong>Time-of-Day Activity<\/strong>: Posting hours differed, reinforcing behavioral distinctions<\/li>\n<\/ul>\n<p><strong>Conclusion<\/strong><\/p>\n<p>The evidence strongly suggests <strong>SM and GWF are separate individuals<\/strong>. 
This analysis successfully applied NLP, real-world API data scraping, and visual storytelling to approach a forensic linguistics-style question.<\/p>\n<p><strong>Skills Demonstrated<\/strong><\/p>\n<ul>\n<li>Natural Language Processing (NLP)<\/li>\n<li>Data Wrangling &amp; Cleaning<\/li>\n<li>API Integration (PRAW)<\/li>\n<li>Custom Text Preprocessing with Regex<\/li>\n<li>Data Visualization &amp; Exploratory Analysis<\/li>\n<li>Behavioral Pattern Recognition<\/li>\n<\/ul><\/div><\/div><\/div><\/div><\/div><\/div><div class=\"x-section e65-e68 m1t-0 m1t-3 m1t-4 m1t-8\"><div class=\"x-row x-container max width e65-e69 m1t-y m1t-z m1t-11 m1t-17\"><div class=\"x-row-inner\"><div class=\"x-col e65-e70 m1t-18 m1t-19\"><div class=\"x-div x-container max width e65-e71 m1t-d m1t-e m1t-f m1t-g m1t-h m1t-i m1t-j\"><div class=\"x-text x-text-headline e65-e72 m1t-p m1t-q m1t-r m1t-u\"><div class=\"x-text-content\"><div class=\"x-text-content-text\">\n<h2 class=\"x-text-content-text-primary\"><strong id=\"salmon\">Salmon Weight Prediction with Support Vector Regression<\/strong><\/h2><\/div><\/div><\/div><div class=\"x-text x-content e65-e73 m1t-w m1t-x\"><p><strong>Project Type:<\/strong> Regression modeling using Scikit-learn and Support Vector Regression<br \/><strong>Type: <\/strong>Machine Learning | Regression | Data Cleaning <br \/><strong>Tools:<\/strong> Python | Scikit-learn | NumPy | Pandas | Matplotlib | Seaborn<\/p>\n<p><strong>GitHub<\/strong>: <a href=\"https:\/\/github.com\/birdmoney11\/portfolio-samples\/tree\/main\/DS%20Salmon%20size\">https:\/\/github.com\/birdmoney11\/portfolio-samples\/tree\/main\/DS%20Salmon%20size<\/a><\/p>\n<p><strong>Project Overview<\/strong><\/p>\n<p>This project builds a regression-based machine learning model to predict the <strong>weight of Pacific salmon<\/strong> based solely on their <strong>length<\/strong>, enabling potential real-time applications in <strong>ecological monitoring<\/strong> and <strong>AI-powered species recognition 
systems<\/strong>.<\/p>\n<p><strong>Dataset<\/strong><\/p>\n<ul>\n<li><strong>Source<\/strong>: Alaska Salmon Survey data (via Kaggle)<\/li>\n<li><strong>Size<\/strong>: ~14 million records (filtered to ~677,000 valid samples)<\/li>\n<li><strong>Species<\/strong>: Chinook, Chum, Coho, Pink, Sockeye<\/li>\n<li><strong>Key Features<\/strong>: Length (input), Weight (target variable)<\/li>\n<li><strong>Use Case<\/strong>: Simulates AI applications in wildlife management like <strong>Norway&rsquo;s invasive species detection system<\/strong><\/li>\n<\/ul>\n<p><strong>Methodology<\/strong><\/p>\n<ol>\n<li><strong>Data Cleaning<\/strong><\/li>\n<ul>\n<li>Removed nulls, duplicates, and inconsistent records<\/li>\n<li>Focused on biologically valid combinations (e.g., Chinook species with realistic weight-length pairs)<\/li>\n<\/ul>\n<li><strong>Exploratory Data Analysis<\/strong><\/li>\n<ul>\n<li>Found correlations between age, gender, and weight<\/li>\n<li>Verified statistical distributions across species<\/li>\n<\/ul>\n<li><strong>Modeling: Support Vector Regression (SVR)<\/strong><\/li>\n<ul>\n<li>Used scikit-learn's SVR() with RBF kernel<\/li>\n<li>Input: Salmon length<\/li>\n<li>Target: Weight<\/li>\n<li>Train\/test split: 80\/20<\/li>\n<li>Performance evaluated using:\n<ul>\n<li><strong>R&sup2; Score<\/strong><\/li>\n<li><strong>Mean Absolute Error (MAE)<\/strong><\/li>\n<li><strong>Root Mean Square Error (RMSE)<\/strong><\/li>\n<\/ul>\n<\/li>\n<\/ul>\n<li><strong>Visualization<\/strong><\/li>\n<ul>\n<li>Created 2x2 subplots showing prediction vs. 
actual values<\/li>\n<li>Compared multiple models (SVR #1 vs SVR #2) for robustness across different data subsets<\/li>\n<\/ul>\n<\/ol>\n<p><strong>Results &amp; Insights<\/strong><\/p>\n<ul>\n<li>SVR predicted weight values that closely followed the actual regression curve<\/li>\n<li><strong>Model #2<\/strong> performed better at higher-length values<\/li>\n<li><strong>Key Insight<\/strong>: With high-quality, cleaned data, simple ML models like SVR can yield highly accurate predictions<\/li>\n<li><strong>Limitation<\/strong>: Highly processed data may inflate apparent accuracy&mdash;future work could explore raw data performance<\/li>\n<\/ul>\n<p><strong>Real-World Relevance<\/strong><\/p>\n<ul>\n<li>Enables <strong>non-invasive, real-time predictions<\/strong> of fish weight using only visual input<\/li>\n<li>Could power <strong>camera-based AI<\/strong> systems to assist in species detection, invasive species filtering, or population studies<\/li>\n<li>Highlights use of <strong>ML in marine biology, sustainability, and automation<\/strong><\/li>\n<\/ul>\n<p><strong>Tools &amp; Technologies<\/strong><\/p>\n<ul>\n<li>Python, NumPy, Pandas, Matplotlib, Seaborn<\/li>\n<li>Scikit-learn (SVR, model selection, metrics)<\/li>\n<li>Jupyter Notebook, Visual Studio Code<\/li>\n<\/ul><\/div><\/div><\/div><\/div><\/div><\/div><\/div>\n","protected":false},"excerpt":{"rendered":"<p>Project ShowcaseThe collection.Deep Learning NSFW Image Classifier Deep Learning | Computer VisionLearn MoreSMS Spam Detection Classifier NLP | Machine Learning | Text ClassificationLearn MoreIMDB Movie Review Sentiment Classifier NLP | Deep Learning |&nbsp; Sentiment Analysis | Text ClassificationLearn MoreReddit Post Authorship &amp; Behavioral Analysis NLP | Reddit API | Data Visualization | Text Classification&nbsp;Learn MoreSalmon Weight Prediction with Support Vector &#8230; <\/p>\n<div><a href=\"https:\/\/collincheuk.com\/?page_id=65\" class=\"more-link\">Read 
More<\/a><\/div>\n","protected":false},"author":1,"featured_media":0,"parent":42,"menu_order":0,"comment_status":"closed","ping_status":"closed","template":"template-blank-4.php","meta":{"footnotes":""},"class_list":["post-65","page","type-page","status-publish","hentry","no-post-thumbnail"],"_links":{"self":[{"href":"https:\/\/collincheuk.com\/index.php?rest_route=\/wp\/v2\/pages\/65","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/collincheuk.com\/index.php?rest_route=\/wp\/v2\/pages"}],"about":[{"href":"https:\/\/collincheuk.com\/index.php?rest_route=\/wp\/v2\/types\/page"}],"author":[{"embeddable":true,"href":"https:\/\/collincheuk.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/collincheuk.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=65"}],"version-history":[{"count":13,"href":"https:\/\/collincheuk.com\/index.php?rest_route=\/wp\/v2\/pages\/65\/revisions"}],"predecessor-version":[{"id":130,"href":"https:\/\/collincheuk.com\/index.php?rest_route=\/wp\/v2\/pages\/65\/revisions\/130"}],"up":[{"embeddable":true,"href":"https:\/\/collincheuk.com\/index.php?rest_route=\/wp\/v2\/pages\/42"}],"wp:attachment":[{"href":"https:\/\/collincheuk.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=65"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}